Coverage Report

Created: 2026-03-23 06:45

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
15.9M
#define MAX_UNICODE _Py_MAX_UNICODE
105
239M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
/* Read the `utf8` pointer field of `op`, viewed as a compact unicode object,
   through FT_ATOMIC_LOAD_PTR_ACQUIRE.  No type or layout check is performed
   here; callers are expected to pass a str object. */
static inline char* _PyUnicode_UTF8(PyObject *op)
114
184M
{
115
184M
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
116
184M
}
117
118
/* Return the UTF-8 representation pointer of `op`.
   For a compact ASCII string this is the address immediately following the
   PyASCIIObject header; for every other string it is whatever the `utf8`
   field currently holds (see _PyUnicode_UTF8()), which may be NULL. */
static inline char* PyUnicode_UTF8(PyObject *op)
119
153M
{
120
153M
    assert(_PyUnicode_CHECK(op));
121
153M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
122
137M
        return ((char*)(_PyASCIIObject_CAST(op) + 1));
123
137M
    }
124
15.3M
    else {
125
15.3M
         return _PyUnicode_UTF8(op);
126
15.3M
    }
127
153M
}
128
129
/* Store `utf8` into the `utf8` field of `op` (viewed as a compact unicode
   object) through FT_ATOMIC_STORE_PTR_RELEASE.  The previous pointer value
   is overwritten without being freed here. */
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
130
29.8M
{
131
29.8M
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
132
29.8M
}
133
134
/* Return the length associated with the UTF-8 representation of `op`.
   For a compact ASCII string this is the `length` field of the ASCII header;
   otherwise it is the `utf8_length` field of the compact object. */
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
135
71.7M
{
136
71.7M
    assert(_PyUnicode_CHECK(op));
137
71.7M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
138
68.6M
         return _PyASCIIObject_CAST(op)->length;
139
68.6M
    }
140
3.11M
    else {
141
3.11M
         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
142
3.11M
    }
143
71.7M
}
144
145
/* Assign `length` to the `utf8_length` field of `op` (viewed as a compact
   unicode object).  Unlike PyUnicode_SET_UTF8(), this is a plain store. */
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
146
29.8M
{
147
29.8M
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
148
29.8M
}
149
150
#define _PyUnicode_LENGTH(op)                           \
151
499M
    (_PyASCIIObject_CAST(op)->length)
152
#define _PyUnicode_STATE(op)                            \
153
3.15G
    (_PyASCIIObject_CAST(op)->state)
154
#define _PyUnicode_HASH(op)                             \
155
464M
    (_PyASCIIObject_CAST(op)->hash)
156
157
1.03G
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
/* Store `hash` into the `hash` field of the ASCII header of `op`, using
   FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
160
41.3M
{
161
41.3M
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
162
41.3M
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
64.6M
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
/* Return non-zero if the `utf8` pointer of `op` is the same address as its
   character data (PyUnicode_DATA(op)).  Must not be called on a compact
   ASCII string (asserted below). */
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
168
0
{
169
0
    assert(_PyUnicode_CHECK(op));
170
0
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
171
0
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
172
0
}
173
174
/* Return true if the Unicode object has its own allocated UTF-8 memory block:
175
   the object is not compact ASCII, its utf8 pointer is non-NULL, and that
   pointer differs from PyUnicode_DATA(op) (i.e. it is not shared with the
   character data). */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
498M
{
178
498M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
155M
            && _PyUnicode_UTF8(op) != NULL
180
13.7M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
498M
}
182
183
184
182M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
// The address of the statically declared _Py_STR(empty) object is returned
// as-is; this function does not modify its reference count.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
98.5M
{
204
98.5M
    _Py_DECLARE_STR(empty, "");
205
98.5M
    return &_Py_STR(empty);
206
98.5M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
 *
 * Returns the `interned_strings` cached object of `interp`.  The slot is
 * NULL until init_interned_dict() fills it, and NULL again after
 * clear_interned_dict() has run.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
4.99M
{
213
4.99M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
4.99M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
5.11M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter.
   The result is the number of entries in the global INTERNED_STRINGS
   hashtable plus the size of the current interpreter's interned dict. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter.
   Every entry of the global INTERNED_STRINGS hashtable is counted, plus
   those keys of the interpreter's interned dict whose interned state is
   SSTATE_INTERNED_IMMORTAL. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
        /* statically allocated interned strings are not expected in the
           per-interpreter dict */
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
/* Hash callback for the INTERNED_STRINGS hashtable: `key` is treated as a
   PyObject* and hashed with unicode_hash(). */
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
5.11M
{
256
5.11M
    return unicode_hash((PyObject *)key);
257
5.11M
}
258
259
/* Compare callback for the INTERNED_STRINGS hashtable.
   When both keys are non-NULL the result is unicode_eq(obj1, obj2).
   Otherwise the pointers themselves are compared, so the result is true
   only when both keys are NULL. */
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
526k
{
262
526k
    PyObject *obj1 = (PyObject *)key1;
263
526k
    PyObject *obj2 = (PyObject *)key2;
264
526k
    if (obj1 != NULL && obj2 != NULL) {
265
526k
        return unicode_eq(obj1, obj2);
266
526k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
526k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
/* True when `interp` is not the main interpreter and has the
   Py_RTFLAGS_USE_MAIN_OBMALLOC bit set in its feature_flags. */
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
36
{
285
36
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
36
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
36
}
288
289
/* Install the interned-strings dict of `interp`.
   If has_shared_intern_dict(interp) is true, the main interpreter's dict is
   reused and a new reference to it is taken; otherwise a fresh dict is
   created.  The slot must be empty on entry (asserted).
   Returns 0 on success, -1 if PyDict_New() fails. */
static int
290
init_interned_dict(PyInterpreterState *interp)
291
36
{
292
36
    assert(get_interned_dict(interp) == NULL);
293
36
    PyObject *interned;
294
36
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
36
    else {
299
36
        interned = PyDict_New();
300
36
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
36
    }
304
36
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
36
    return 0;
306
36
}
307
308
/* Release the interned-strings dict of `interp`, if any.
   The dict's contents are cleared only when the dict is not shared with the
   main interpreter; in every case this interpreter's reference is dropped
   and the slot is reset to NULL. */
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
/* Create the global INTERNED_STRINGS hashtable and populate it.
   The table is allocated with PyMem_RawMalloc/PyMem_RawFree and uses
   hashtable_unicode_hash / hashtable_unicode_compare as callbacks.
   After creation, the static strings are interned through
   _PyUnicode_InitStaticStrings() and the 256 LATIN1() one-character
   strings through _PyUnicode_InternStatic().
   Returns _PyStatus_OK() on success, or an error status if the hashtable
   could not be created (any pending exception is cleared in that case).
   INTERNED_STRINGS must be NULL on entry (asserted). */
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
36
{
325
36
    assert(INTERNED_STRINGS == NULL);
326
36
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
36
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
36
        hashtable_unicode_hash,
330
36
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
36
        NULL,
334
36
        NULL,
335
36
        &hashtable_alloc
336
36
    );
337
36
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
36
    _PyUnicode_InitStaticStrings(interp);
350
351
9.25k
    for (int i = 0; i < 256; i++) {
352
9.21k
        PyObject *s = LATIN1(i);
353
9.21k
        _PyUnicode_InternStatic(interp, &s);
354
        /* interning must not have replaced the static object */
9.21k
        assert(s == LATIN1(i));
355
9.21k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
36
    return _PyStatus_OK();
364
36
}
365
366
/* Destroy the global INTERNED_STRINGS hashtable and reset the pointer to
   NULL.  Does nothing when the table does not exist. */
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
/* Statement macro: return the empty string singleton obtained from
   _PyUnicode_GetEmpty() from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY()   \
375
46.9M
    do {                             \
376
46.9M
        return _PyUnicode_GetEmpty();\
377
46.9M
    } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters.
   Lookup table with 128 entries, indexed by ASCII code point: the entry is 1
   for the characters named in the inline comments below (0x09-0x0D,
   0x1C-0x1F and 0x20) and 0 for everything else. */
381
const unsigned char _Py_ascii_whitespace[] = {
382
    0, 0, 0, 0, 0, 0, 0, 0,
383
/*     case 0x0009: * CHARACTER TABULATION */
384
/*     case 0x000A: * LINE FEED */
385
/*     case 0x000B: * LINE TABULATION */
386
/*     case 0x000C: * FORM FEED */
387
/*     case 0x000D: * CARRIAGE RETURN */
388
    0, 1, 1, 1, 1, 1, 0, 0,
389
    0, 0, 0, 0, 0, 0, 0, 0,
390
/*     case 0x001C: * FILE SEPARATOR */
391
/*     case 0x001D: * GROUP SEPARATOR */
392
/*     case 0x001E: * RECORD SEPARATOR */
393
/*     case 0x001F: * UNIT SEPARATOR */
394
    0, 0, 0, 0, 1, 1, 1, 1,
395
/*     case 0x0020: * SPACE */
396
    1, 0, 0, 0, 0, 0, 0, 0,
397
    0, 0, 0, 0, 0, 0, 0, 0,
398
    0, 0, 0, 0, 0, 0, 0, 0,
399
    0, 0, 0, 0, 0, 0, 0, 0,
400
401
    0, 0, 0, 0, 0, 0, 0, 0,
402
    0, 0, 0, 0, 0, 0, 0, 0,
403
    0, 0, 0, 0, 0, 0, 0, 0,
404
    0, 0, 0, 0, 0, 0, 0, 0,
405
    0, 0, 0, 0, 0, 0, 0, 0,
406
    0, 0, 0, 0, 0, 0, 0, 0,
407
    0, 0, 0, 0, 0, 0, 0, 0,
408
    0, 0, 0, 0, 0, 0, 0, 0
409
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Fast detection of the most frequent linebreak characters.
   Lookup table with 128 entries, indexed by ASCII code point: the entry is 1
   for the characters named in the inline comments below (0x0A-0x0D and
   0x1C-0x1E) and 0 for everything else.  Used by BLOOM_LINEBREAK() for
   code points below 128. */
436
static const unsigned char ascii_linebreak[] = {
437
    0, 0, 0, 0, 0, 0, 0, 0,
438
/*         0x000A, * LINE FEED */
439
/*         0x000B, * LINE TABULATION */
440
/*         0x000C, * FORM FEED */
441
/*         0x000D, * CARRIAGE RETURN */
442
    0, 0, 1, 1, 1, 1, 0, 0,
443
    0, 0, 0, 0, 0, 0, 0, 0,
444
/*         0x001C, * FILE SEPARATOR */
445
/*         0x001D, * GROUP SEPARATOR */
446
/*         0x001E, * RECORD SEPARATOR */
447
    0, 0, 0, 0, 1, 1, 1, 0,
448
    0, 0, 0, 0, 0, 0, 0, 0,
449
    0, 0, 0, 0, 0, 0, 0, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0,
452
453
    0, 0, 0, 0, 0, 0, 0, 0,
454
    0, 0, 0, 0, 0, 0, 0, 0,
455
    0, 0, 0, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 0, 0, 0, 0, 0, 0, 0,
459
    0, 0, 0, 0, 0, 0, 0, 0,
460
    0, 0, 0, 0, 0, 0, 0, 0
461
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
/* Map an error handler name to its _Py_error_handler enum value.
   A NULL `errors` is treated like "strict".  The comparison is an exact,
   case-sensitive strcmp(); any name that is not one of the seven listed
   below yields _Py_ERROR_OTHER. */
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
2.96M
{
471
2.96M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
2.25M
        return _Py_ERROR_STRICT;
473
2.25M
    }
474
704k
    if (strcmp(errors, "surrogateescape") == 0) {
475
487k
        return _Py_ERROR_SURROGATEESCAPE;
476
487k
    }
477
217k
    if (strcmp(errors, "replace") == 0) {
478
217k
        return _Py_ERROR_REPLACE;
479
217k
    }
480
0
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
0
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
0
    if (strcmp(errors, "surrogatepass") == 0) {
487
0
        return _Py_ERROR_SURROGATEPASS;
488
0
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
/* Wide-character counterpart of _Py_GetErrorHandler(): map an error handler
   name given as a wchar_t string to its _Py_error_handler enum value.
   A NULL `errors` is treated like L"strict"; unrecognized names yield
   _Py_ERROR_OTHER. */
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
12.4k
{
499
12.4k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
12.4k
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
12.4k
        return _Py_ERROR_SURROGATEESCAPE;
504
12.4k
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
/* Check that `encoding` names a known codec and `errors` a known error
   handler, by looking them up.

   Returns 0 when the check passes or is skipped, and -1 when
   _PyCodec_Lookup() or PyCodec_LookupError() returns NULL (presumably with
   an exception set by the callee -- not visible from here).

   The check is skipped when:
   - both arguments are NULL;
   - this is not a Py_DEBUG build and dev_mode is off;
   - interp->unicode.fs_codec.encoding is not set yet;
   - the interpreter is finalizing.
   Lookups are also bypassed for a few common built-in encoding and error
   handler names (see the fast paths below). */
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
39.1M
{
527
39.1M
    if (encoding == NULL && errors == NULL) {
528
12.4M
        return 0;
529
12.4M
    }
530
531
26.7M
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
26.7M
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
26.7M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
26.7M
        return 0;
536
26.7M
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before _PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, this allows
548
     * calling PyObject_Dump() during finalization for debugging purposes.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
        /* only the existence of the codec matters; drop the result */
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
        /* only the existence of the handler matters; drop the result */
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
/* Validate the internal invariants of the str object `op`.

   Every failed CHECK() reports through _PyObject_ASSERT_FAILED_MSG(op, ...)
   with the stringified expression.  When all checks pass the function
   returns 1, which lets it be used inside assert() as done elsewhere in
   this file.

   Checks performed:
   - `op` is a str object;
   - the kind / ascii / compact state bits and the utf8 pointer are mutually
     consistent for the three layouts (compact ASCII, compact non-ASCII,
     non-compact);
   - if `check_content` is non-zero: an O(n) scan verifies that the maximum
     character fits the narrowest possible kind and that the character just
     past the end is 0;
   - in Py_DEBUG builds only: the interning state agrees with the
     statically_allocated bit and with immortality. */
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
0
{
590
0
#define CHECK(expr) \
591
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
593
0
    assert(op != NULL);
594
0
    CHECK(PyUnicode_Check(op));
595
596
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
597
0
    int kind = ascii->state.kind;
598
599
    /* compact ASCII layout: only the kind needs checking */
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
600
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
601
0
    }
602
0
    else {
603
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
604
0
        void *data;
605
606
0
        if (ascii->state.compact == 1) {
607
            /* compact non-ASCII layout: character data starts right after
               the PyCompactUnicodeObject header */
0
            data = compact + 1;
608
0
            CHECK(kind == PyUnicode_1BYTE_KIND
609
0
                                 || kind == PyUnicode_2BYTE_KIND
610
0
                                 || kind == PyUnicode_4BYTE_KIND);
611
0
            CHECK(ascii->state.ascii == 0);
612
0
            CHECK(_PyUnicode_UTF8(op) != data);
613
0
        }
614
0
        else {
615
            /* non-compact layout: character data lives behind data.any */
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
616
617
0
            data = unicode->data.any;
618
0
            CHECK(kind == PyUnicode_1BYTE_KIND
619
0
                     || kind == PyUnicode_2BYTE_KIND
620
0
                     || kind == PyUnicode_4BYTE_KIND);
621
0
            CHECK(ascii->state.compact == 0);
622
0
            CHECK(data != NULL);
623
0
            if (ascii->state.ascii) {
624
                /* ASCII data doubles as the UTF-8 representation */
0
                CHECK(_PyUnicode_UTF8(op) == data);
625
0
                CHECK(compact->utf8_length == ascii->length);
626
0
            }
627
0
            else {
628
0
                CHECK(_PyUnicode_UTF8(op) != data);
629
0
            }
630
0
        }
631
0
#ifndef Py_GIL_DISABLED
632
0
        if (_PyUnicode_UTF8(op) == NULL)
633
0
            CHECK(compact->utf8_length == 0);
634
0
#endif
635
0
    }
636
637
    /* check that the best kind is used: O(n) operation */
638
0
    if (check_content) {
639
0
        Py_ssize_t i;
640
0
        Py_UCS4 maxchar = 0;
641
0
        const void *data;
642
0
        Py_UCS4 ch;
643
644
0
        data = PyUnicode_DATA(ascii);
645
0
        for (i=0; i < ascii->length; i++)
646
0
        {
647
0
            ch = PyUnicode_READ(kind, data, i);
648
0
            if (ch > maxchar)
649
0
                maxchar = ch;
650
0
        }
651
0
        if (kind == PyUnicode_1BYTE_KIND) {
652
0
            if (ascii->state.ascii == 0) {
653
0
                CHECK(maxchar >= 128);
654
0
                CHECK(maxchar <= 255);
655
0
            }
656
0
            else
657
0
                CHECK(maxchar < 128);
658
0
        }
659
0
        else if (kind == PyUnicode_2BYTE_KIND) {
660
0
            CHECK(maxchar >= 0x100);
661
0
            CHECK(maxchar <= 0xFFFF);
662
0
        }
663
0
        else {
664
0
            CHECK(maxchar >= 0x10000);
665
0
            CHECK(maxchar <= MAX_UNICODE);
666
0
        }
667
        /* the character just past the end must be a 0 terminator */
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
668
0
    }
669
670
    /* Check interning state */
671
#ifdef Py_DEBUG
672
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
673
    // extensions can make immortal strings mortal (but with a high enough
674
    // refcount).
675
    // The other way is extremely unlikely (worth a potential failed assertion
676
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
677
    switch (PyUnicode_CHECK_INTERNED(op)) {
678
        case SSTATE_NOT_INTERNED:
679
            if (ascii->state.statically_allocated) {
680
                // This state is for two exceptions:
681
                // - strings are currently checked before they're interned
682
                // - the 256 one-latin1-character strings
683
                //   are static but use SSTATE_NOT_INTERNED
684
            }
685
            else {
686
                CHECK(!_Py_IsImmortal(op));
687
            }
688
            break;
689
        case SSTATE_INTERNED_MORTAL:
690
            CHECK(!ascii->state.statically_allocated);
691
            CHECK(!_Py_IsImmortal(op));
692
            break;
693
        case SSTATE_INTERNED_IMMORTAL:
694
            CHECK(!ascii->state.statically_allocated);
695
            break;
696
        case SSTATE_INTERNED_IMMORTAL_STATIC:
697
            CHECK(ascii->state.statically_allocated);
698
            break;
699
        default:
700
            Py_UNREACHABLE();
701
    }
702
#endif
703
704
0
    return 1;
705
706
0
#undef CHECK
707
0
}
708
709
/* Normalize a string result by substituting shared singletons.

   - Length 0: return the empty string singleton.
   - Length 1 with 1-byte kind: return the LATIN1() object for that
     character.
   In both cases, if `unicode` is a different object than the singleton,
   the reference to `unicode` is released with Py_DECREF().
   - Otherwise: return `unicode` itself.

   The caller's reference to `unicode` is therefore consumed either way. */
PyObject*
710
_PyUnicode_Result(PyObject *unicode)
711
46.7M
{
712
46.7M
    assert(_PyUnicode_CHECK(unicode));
713
714
46.7M
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
715
46.7M
    if (length == 0) {
716
300
        PyObject *empty = _PyUnicode_GetEmpty();
717
300
        if (unicode != empty) {
718
0
            Py_DECREF(unicode);
719
0
        }
720
300
        return empty;
721
300
    }
722
723
46.7M
    if (length == 1) {
724
856k
        int kind = PyUnicode_KIND(unicode);
725
856k
        if (kind == PyUnicode_1BYTE_KIND) {
726
112k
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
727
112k
            Py_UCS1 ch = data[0];
728
112k
            PyObject *latin1_char = LATIN1(ch);
729
112k
            if (unicode != latin1_char) {
730
107k
                Py_DECREF(unicode);
731
107k
            }
732
112k
            return latin1_char;
733
112k
        }
734
856k
    }
735
736
46.7M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
737
46.6M
    return unicode;
738
46.7M
}
739
1.22M
#define unicode_result _PyUnicode_Result
740
741
/* Return `unicode` as a result without modifying its value.
   For an exact str object a new reference to the same object is returned;
   for an instance of a subtype, the result of _PyUnicode_Copy() is returned
   instead. */
static PyObject*
742
unicode_result_unchanged(PyObject *unicode)
743
89.2M
{
744
89.2M
    if (PyUnicode_CheckExact(unicode)) {
745
86.4M
        return Py_NewRef(unicode);
746
86.4M
    }
747
2.84M
    else
748
        /* Subtype -- return genuine unicode string with the same value. */
749
2.84M
        return _PyUnicode_Copy(unicode);
750
89.2M
}
751
752
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
753
   ASCII, Latin1, UTF-8, etc.

   Writes an escape for every character of `unicode` in the range
   [collstart, collend): \xNN for ch < 0x100, \uNNNN for ch < 0x10000 and
   \UNNNNNNNN otherwise.

   `str` is the current write position inside `writer`.  Returns the write
   position after the generated escapes, or NULL on failure: either
   OverflowError is set here when the total size would exceed
   PY_SSIZE_T_MAX, or PyBytesWriter_GrowAndUpdatePointer() returned NULL. */
754
static char*
755
backslashreplace(PyBytesWriter *writer, char *str,
756
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
757
0
{
758
0
    Py_ssize_t size, i;
759
0
    Py_UCS4 ch;
760
0
    int kind;
761
0
    const void *data;
762
763
0
    kind = PyUnicode_KIND(unicode);
764
0
    data = PyUnicode_DATA(unicode);
765
766
0
    size = 0;
767
    /* determine replacement size */
768
0
    for (i = collstart; i < collend; ++i) {
769
0
        Py_ssize_t incr;
770
771
        /* incr = 2 bytes for the backslash and the x/u/U letter,
           plus 2, 4 or 8 hex digits */
0
        ch = PyUnicode_READ(kind, data, i);
772
0
        if (ch < 0x100)
773
0
            incr = 2+2;
774
0
        else if (ch < 0x10000)
775
0
            incr = 2+4;
776
0
        else {
777
0
            assert(ch <= MAX_UNICODE);
778
0
            incr = 2+8;
779
0
        }
780
        /* overflow-safe form of "size + incr > PY_SSIZE_T_MAX" */
0
        if (size > PY_SSIZE_T_MAX - incr) {
781
0
            PyErr_SetString(PyExc_OverflowError,
782
0
                            "encoded result is too long for a Python string");
783
0
            return NULL;
784
0
        }
785
0
        size += incr;
786
0
    }
787
788
    /* presumably grows the writer by `size` bytes and returns the updated
       write position -- confirm against the PyBytesWriter API */
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
789
0
    if (str == NULL) {
790
0
        return NULL;
791
0
    }
792
793
    /* generate replacement */
794
0
    for (i = collstart; i < collend; ++i) {
795
0
        ch = PyUnicode_READ(kind, data, i);
796
0
        *str++ = '\\';
797
0
        if (ch >= 0x00010000) {
798
0
            *str++ = 'U';
799
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
800
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
805
0
        }
806
0
        else if (ch >= 0x100) {
807
0
            *str++ = 'u';
808
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
810
0
        }
811
0
        else
812
0
            *str++ = 'x';
813
        /* the two lowest hex digits are emitted for all three forms */
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
814
0
        *str++ = Py_hexdigits[ch&0xf];
815
0
    }
816
0
    return str;
817
0
}
818
819
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
820
   ASCII, Latin1, UTF-8, etc.

   Writes a decimal character reference "&#N;" for every character of
   `unicode` in the range [collstart, collend).

   `str` is the current write position inside `writer`.  Returns the write
   position after the generated references, or NULL on failure: either
   OverflowError is set here when the total size would exceed
   PY_SSIZE_T_MAX, or PyBytesWriter_GrowAndUpdatePointer() returned NULL,
   or sprintf() reported an error. */
821
static char*
822
xmlcharrefreplace(PyBytesWriter *writer, char *str,
823
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
824
0
{
825
0
    Py_ssize_t size, i;
826
0
    Py_UCS4 ch;
827
0
    int kind;
828
0
    const void *data;
829
830
0
    kind = PyUnicode_KIND(unicode);
831
0
    data = PyUnicode_DATA(unicode);
832
833
0
    size = 0;
834
    /* determine replacement size */
835
0
    for (i = collstart; i < collend; ++i) {
836
0
        Py_ssize_t incr;
837
838
        /* incr = 2 bytes for "&#", one byte per decimal digit of ch,
           and 1 byte for ";" */
0
        ch = PyUnicode_READ(kind, data, i);
839
0
        if (ch < 10)
840
0
            incr = 2+1+1;
841
0
        else if (ch < 100)
842
0
            incr = 2+2+1;
843
0
        else if (ch < 1000)
844
0
            incr = 2+3+1;
845
0
        else if (ch < 10000)
846
0
            incr = 2+4+1;
847
0
        else if (ch < 100000)
848
0
            incr = 2+5+1;
849
0
        else if (ch < 1000000)
850
0
            incr = 2+6+1;
851
0
        else {
852
0
            assert(ch <= MAX_UNICODE);
853
0
            incr = 2+7+1;
854
0
        }
855
        /* overflow-safe form of "size + incr > PY_SSIZE_T_MAX" */
0
        if (size > PY_SSIZE_T_MAX - incr) {
856
0
            PyErr_SetString(PyExc_OverflowError,
857
0
                            "encoded result is too long for a Python string");
858
0
            return NULL;
859
0
        }
860
0
        size += incr;
861
0
    }
862
863
    /* presumably grows the writer by `size` bytes and returns the updated
       write position -- confirm against the PyBytesWriter API */
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
864
0
    if (str == NULL) {
865
0
        return NULL;
866
0
    }
867
868
    /* generate replacement */
869
0
    for (i = collstart; i < collend; ++i) {
870
        /* `size` is reused here for the number of bytes sprintf() wrote */
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
871
0
        if (size < 0) {
872
0
            return NULL;
873
0
        }
874
0
        str += size;
875
0
    }
876
0
    return str;
877
0
}
878
879
/* --- Bloom Filters ----------------------------------------------------- */
880
881
/* stuff to implement simple "bloom filters" for Unicode characters.
882
   to keep things simple, we use a single bitmask, using the low-order
883
   bits (ch & (BLOOM_WIDTH - 1)) of each Unicode character as the bit index. */
884
885
/* the linebreak mask is set up by _PyUnicode_Init() below */
886
887
#if LONG_BIT >= 128
888
#define BLOOM_WIDTH 128
889
#elif LONG_BIT >= 64
890
20.7M
#define BLOOM_WIDTH 64
891
#elif LONG_BIT >= 32
892
#define BLOOM_WIDTH 32
893
#else
894
#error "LONG_BIT is smaller than 32"
895
#endif
896
897
7.31M
#define BLOOM_MASK unsigned long
898
899
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
900
901
27.3M
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
902
903
#define BLOOM_LINEBREAK(ch)                                             \
904
115M
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
905
115M
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
906
907
static inline BLOOM_MASK
908
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
909
3.65M
{
910
3.65M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
911
3.65M
    do {                                               \
912
3.65M
        TYPE *data = (TYPE *)PTR;                      \
913
3.65M
        TYPE *end = data + LEN;                        \
914
3.65M
        Py_UCS4 ch;                                    \
915
9.05M
        for (; data != end; data++) {                  \
916
5.39M
            ch = *data;                                \
917
5.39M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
918
5.39M
        }                                              \
919
3.65M
        break;                                         \
920
3.65M
    } while (0)
921
922
    /* calculate simple bloom-style bitmask for a given unicode string */
923
924
3.65M
    BLOOM_MASK mask;
925
926
3.65M
    mask = 0;
927
3.65M
    switch (kind) {
928
3.65M
    case PyUnicode_1BYTE_KIND:
929
3.65M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
930
3.65M
        break;
931
36
    case PyUnicode_2BYTE_KIND:
932
36
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
933
36
        break;
934
0
    case PyUnicode_4BYTE_KIND:
935
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
936
0
        break;
937
0
    default:
938
0
        Py_UNREACHABLE();
939
3.65M
    }
940
3.65M
    return mask;
941
942
3.65M
#undef BLOOM_UPDATE
943
3.65M
}
944
945
/* Compilation of templated routines */
946
947
974k
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
948
949
#include "stringlib/asciilib.h"
950
#include "stringlib/fastsearch.h"
951
#include "stringlib/partition.h"
952
#include "stringlib/split.h"
953
#include "stringlib/count.h"
954
#include "stringlib/find.h"
955
#include "stringlib/find_max_char.h"
956
#include "stringlib/undef.h"
957
958
#include "stringlib/ucs1lib.h"
959
#include "stringlib/fastsearch.h"
960
#include "stringlib/partition.h"
961
#include "stringlib/split.h"
962
#include "stringlib/count.h"
963
#include "stringlib/find.h"
964
#include "stringlib/replace.h"
965
#include "stringlib/repr.h"
966
#include "stringlib/find_max_char.h"
967
#include "stringlib/undef.h"
968
969
#include "stringlib/ucs2lib.h"
970
#include "stringlib/fastsearch.h"
971
#include "stringlib/partition.h"
972
#include "stringlib/split.h"
973
#include "stringlib/count.h"
974
#include "stringlib/find.h"
975
#include "stringlib/replace.h"
976
#include "stringlib/repr.h"
977
#include "stringlib/find_max_char.h"
978
#include "stringlib/undef.h"
979
980
#include "stringlib/ucs4lib.h"
981
#include "stringlib/fastsearch.h"
982
#include "stringlib/partition.h"
983
#include "stringlib/split.h"
984
#include "stringlib/count.h"
985
#include "stringlib/find.h"
986
#include "stringlib/replace.h"
987
#include "stringlib/repr.h"
988
#include "stringlib/find_max_char.h"
989
#include "stringlib/undef.h"
990
991
#undef STRINGLIB_GET_EMPTY
992
993
/* --- Unicode Object ----------------------------------------------------- */
994
995
static inline Py_ssize_t
996
findchar(const void *s, int kind,
997
         Py_ssize_t size, Py_UCS4 ch,
998
         int direction)
999
194M
{
1000
194M
    switch (kind) {
1001
188M
    case PyUnicode_1BYTE_KIND:
1002
188M
        if ((Py_UCS1) ch != ch)
1003
3.20k
            return -1;
1004
188M
        if (direction > 0)
1005
188M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1006
65.8k
        else
1007
65.8k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1008
4.36M
    case PyUnicode_2BYTE_KIND:
1009
4.36M
        if ((Py_UCS2) ch != ch)
1010
0
            return -1;
1011
4.36M
        if (direction > 0)
1012
4.17M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1013
194k
        else
1014
194k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1015
1.58M
    case PyUnicode_4BYTE_KIND:
1016
1.58M
        if (direction > 0)
1017
1.49M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1018
88.0k
        else
1019
88.0k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1020
0
    default:
1021
0
        Py_UNREACHABLE();
1022
194M
    }
1023
194M
}
1024
1025
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters of `unicode` located between
   index `old_length` and its current length with 0xff bytes, so that code
   reading data it never wrote is detected earlier.  Nothing is written when
   the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind is used below as the number of bytes per character. */
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1043
1044
static PyObject*
1045
resize_copy(PyObject *unicode, Py_ssize_t length)
1046
0
{
1047
0
    Py_ssize_t copy_length;
1048
0
    PyObject *copy;
1049
1050
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051
0
    if (copy == NULL)
1052
0
        return NULL;
1053
1054
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056
0
    return copy;
1057
0
}
1058
1059
PyObject*
1060
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1061
51.1M
{
1062
51.1M
    Py_ssize_t char_size;
1063
51.1M
    Py_ssize_t struct_size;
1064
51.1M
    Py_ssize_t new_size;
1065
51.1M
    PyObject *new_unicode;
1066
#ifdef Py_DEBUG
1067
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1068
#endif
1069
1070
51.1M
    if (!_PyUnicode_IsModifiable(unicode)) {
1071
0
        PyObject *copy = resize_copy(unicode, length);
1072
0
        if (copy == NULL) {
1073
0
            return NULL;
1074
0
        }
1075
0
        Py_DECREF(unicode);
1076
0
        return copy;
1077
0
    }
1078
51.1M
    assert(PyUnicode_IS_COMPACT(unicode));
1079
1080
51.1M
    char_size = PyUnicode_KIND(unicode);
1081
51.1M
    if (PyUnicode_IS_ASCII(unicode))
1082
38.2M
        struct_size = sizeof(PyASCIIObject);
1083
12.9M
    else
1084
12.9M
        struct_size = sizeof(PyCompactUnicodeObject);
1085
1086
51.1M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1087
0
        PyErr_NoMemory();
1088
0
        return NULL;
1089
0
    }
1090
51.1M
    new_size = (struct_size + (length + 1) * char_size);
1091
1092
51.1M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1093
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1094
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1095
0
        PyUnicode_SET_UTF8(unicode, NULL);
1096
0
    }
1097
#ifdef Py_TRACE_REFS
1098
    _Py_ForgetReference(unicode);
1099
#endif
1100
51.1M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1101
1102
51.1M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1103
51.1M
    if (new_unicode == NULL) {
1104
0
        _Py_NewReferenceNoTotal(unicode);
1105
0
        PyErr_NoMemory();
1106
0
        return NULL;
1107
0
    }
1108
51.1M
    unicode = new_unicode;
1109
51.1M
    _Py_NewReferenceNoTotal(unicode);
1110
1111
51.1M
    _PyUnicode_LENGTH(unicode) = length;
1112
#ifdef Py_DEBUG
1113
    unicode_fill_invalid(unicode, old_length);
1114
#endif
1115
51.1M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1116
51.1M
                    length, 0);
1117
51.1M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1118
51.1M
    return unicode;
1119
51.1M
}
1120
1121
static int
1122
resize_inplace(PyObject *unicode, Py_ssize_t length)
1123
0
{
1124
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1125
0
    assert(Py_REFCNT(unicode) == 1);
1126
1127
0
    Py_ssize_t new_size;
1128
0
    Py_ssize_t char_size;
1129
0
    int share_utf8;
1130
0
    void *data;
1131
#ifdef Py_DEBUG
1132
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1133
#endif
1134
1135
0
    data = _PyUnicode_DATA_ANY(unicode);
1136
0
    char_size = PyUnicode_KIND(unicode);
1137
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1138
1139
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1140
0
        PyErr_NoMemory();
1141
0
        return -1;
1142
0
    }
1143
0
    new_size = (length + 1) * char_size;
1144
1145
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1146
0
    {
1147
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1148
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1149
0
        PyUnicode_SET_UTF8(unicode, NULL);
1150
0
    }
1151
1152
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1153
0
    if (data == NULL) {
1154
0
        PyErr_NoMemory();
1155
0
        return -1;
1156
0
    }
1157
0
    _PyUnicode_DATA_ANY(unicode) = data;
1158
0
    if (share_utf8) {
1159
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1160
0
        PyUnicode_SET_UTF8(unicode, data);
1161
0
    }
1162
0
    _PyUnicode_LENGTH(unicode) = length;
1163
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1164
#ifdef Py_DEBUG
1165
    unicode_fill_invalid(unicode, old_length);
1166
#endif
1167
1168
    /* check for integer overflow */
1169
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1170
0
        PyErr_NoMemory();
1171
0
        return -1;
1172
0
    }
1173
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1174
0
    return 0;
1175
0
}
1176
1177
static const char*
1178
unicode_kind_name(PyObject *unicode)
1179
0
{
1180
    /* don't check consistency: unicode_kind_name() is called from
1181
       _PyUnicode_Dump() */
1182
0
    if (!PyUnicode_IS_COMPACT(unicode))
1183
0
    {
1184
0
        switch (PyUnicode_KIND(unicode))
1185
0
        {
1186
0
        case PyUnicode_1BYTE_KIND:
1187
0
            if (PyUnicode_IS_ASCII(unicode))
1188
0
                return "legacy ascii";
1189
0
            else
1190
0
                return "legacy latin1";
1191
0
        case PyUnicode_2BYTE_KIND:
1192
0
            return "legacy UCS2";
1193
0
        case PyUnicode_4BYTE_KIND:
1194
0
            return "legacy UCS4";
1195
0
        default:
1196
0
            return "<legacy invalid kind>";
1197
0
        }
1198
0
    }
1199
0
    switch (PyUnicode_KIND(unicode)) {
1200
0
    case PyUnicode_1BYTE_KIND:
1201
0
        if (PyUnicode_IS_ASCII(unicode))
1202
0
            return "ascii";
1203
0
        else
1204
0
            return "latin1";
1205
0
    case PyUnicode_2BYTE_KIND:
1206
0
        return "UCS2";
1207
0
    case PyUnicode_4BYTE_KIND:
1208
0
        return "UCS4";
1209
0
    default:
1210
0
        return "<invalid compact kind>";
1211
0
    }
1212
0
}
1213
1214
#ifdef Py_DEBUG
1215
/* Functions wrapping macros for use in debugger */
1216
const char *_PyUnicode_utf8(void *unicode_raw){
1217
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1218
    return PyUnicode_UTF8(unicode);
1219
}
1220
1221
const void *_PyUnicode_compact_data(void *unicode_raw) {
1222
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1223
    return _PyUnicode_COMPACT_DATA(unicode);
1224
}
1225
const void *_PyUnicode_data(void *unicode_raw) {
1226
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1227
    printf("obj %p\n", (void*)unicode);
1228
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1229
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1230
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1231
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1232
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1233
    return PyUnicode_DATA(unicode);
1234
}
1235
1236
void
1237
_PyUnicode_Dump(PyObject *op)
1238
{
1239
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1240
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1241
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1242
    const void *data;
1243
1244
    if (ascii->state.compact)
1245
    {
1246
        if (ascii->state.ascii)
1247
            data = (ascii + 1);
1248
        else
1249
            data = (compact + 1);
1250
    }
1251
    else
1252
        data = unicode->data.any;
1253
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1254
1255
    if (!ascii->state.ascii) {
1256
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1257
    }
1258
    printf(", data=%p\n", data);
1259
}
1260
#endif
1261
1262
1263
PyObject *
1264
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1265
456M
{
1266
    /* Optimization for empty strings */
1267
456M
    if (size == 0) {
1268
24.2M
        return _PyUnicode_GetEmpty();
1269
24.2M
    }
1270
1271
432M
    PyObject *obj;
1272
432M
    PyCompactUnicodeObject *unicode;
1273
432M
    void *data;
1274
432M
    int kind;
1275
432M
    int is_ascii;
1276
432M
    Py_ssize_t char_size;
1277
432M
    Py_ssize_t struct_size;
1278
1279
432M
    is_ascii = 0;
1280
432M
    struct_size = sizeof(PyCompactUnicodeObject);
1281
432M
    if (maxchar < 128) {
1282
305M
        kind = PyUnicode_1BYTE_KIND;
1283
305M
        char_size = 1;
1284
305M
        is_ascii = 1;
1285
305M
        struct_size = sizeof(PyASCIIObject);
1286
305M
    }
1287
127M
    else if (maxchar < 256) {
1288
13.3M
        kind = PyUnicode_1BYTE_KIND;
1289
13.3M
        char_size = 1;
1290
13.3M
    }
1291
113M
    else if (maxchar < 65536) {
1292
104M
        kind = PyUnicode_2BYTE_KIND;
1293
104M
        char_size = 2;
1294
104M
    }
1295
8.87M
    else {
1296
8.87M
        if (maxchar > MAX_UNICODE) {
1297
0
            PyErr_SetString(PyExc_SystemError,
1298
0
                            "invalid maximum character passed to PyUnicode_New");
1299
0
            return NULL;
1300
0
        }
1301
8.87M
        kind = PyUnicode_4BYTE_KIND;
1302
8.87M
        char_size = 4;
1303
8.87M
    }
1304
1305
    /* Ensure we won't overflow the size. */
1306
432M
    if (size < 0) {
1307
0
        PyErr_SetString(PyExc_SystemError,
1308
0
                        "Negative size passed to PyUnicode_New");
1309
0
        return NULL;
1310
0
    }
1311
432M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1312
0
        return PyErr_NoMemory();
1313
1314
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1315
     * PyObject_New() so we are able to allocate space for the object and
1316
     * it's data buffer.
1317
     */
1318
432M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1319
432M
    if (obj == NULL) {
1320
0
        return PyErr_NoMemory();
1321
0
    }
1322
432M
    _PyObject_Init(obj, &PyUnicode_Type);
1323
1324
432M
    unicode = (PyCompactUnicodeObject *)obj;
1325
432M
    if (is_ascii)
1326
305M
        data = ((PyASCIIObject*)obj) + 1;
1327
127M
    else
1328
127M
        data = unicode + 1;
1329
432M
    _PyUnicode_LENGTH(unicode) = size;
1330
432M
    _PyUnicode_HASH(unicode) = -1;
1331
432M
    _PyUnicode_STATE(unicode).interned = 0;
1332
432M
    _PyUnicode_STATE(unicode).kind = kind;
1333
432M
    _PyUnicode_STATE(unicode).compact = 1;
1334
432M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1335
432M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1336
432M
    if (is_ascii) {
1337
305M
        ((char*)data)[size] = 0;
1338
305M
    }
1339
127M
    else if (kind == PyUnicode_1BYTE_KIND) {
1340
13.3M
        ((char*)data)[size] = 0;
1341
13.3M
        unicode->utf8 = NULL;
1342
13.3M
        unicode->utf8_length = 0;
1343
13.3M
    }
1344
113M
    else {
1345
113M
        unicode->utf8 = NULL;
1346
113M
        unicode->utf8_length = 0;
1347
113M
        if (kind == PyUnicode_2BYTE_KIND)
1348
104M
            ((Py_UCS2*)data)[size] = 0;
1349
8.87M
        else /* kind == PyUnicode_4BYTE_KIND */
1350
8.87M
            ((Py_UCS4*)data)[size] = 0;
1351
113M
    }
1352
#ifdef Py_DEBUG
1353
    unicode_fill_invalid((PyObject*)unicode, 0);
1354
#endif
1355
432M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1356
432M
    return obj;
1357
432M
}
1358
1359
static int
1360
unicode_check_modifiable(PyObject *unicode)
1361
612
{
1362
612
    if (!_PyUnicode_IsModifiable(unicode)) {
1363
0
        PyErr_SetString(PyExc_SystemError,
1364
0
                        "Cannot modify a string currently used");
1365
0
        return -1;
1366
0
    }
1367
612
    return 0;
1368
612
}
1369
1370
static int
1371
_copy_characters(PyObject *to, Py_ssize_t to_start,
1372
                 PyObject *from, Py_ssize_t from_start,
1373
                 Py_ssize_t how_many, int check_maxchar)
1374
247M
{
1375
247M
    int from_kind, to_kind;
1376
247M
    const void *from_data;
1377
247M
    void *to_data;
1378
1379
247M
    assert(0 <= how_many);
1380
247M
    assert(0 <= from_start);
1381
247M
    assert(0 <= to_start);
1382
247M
    assert(PyUnicode_Check(from));
1383
247M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1384
1385
247M
    assert(to == NULL || PyUnicode_Check(to));
1386
1387
247M
    if (how_many == 0) {
1388
1.56M
        return 0;
1389
1.56M
    }
1390
1391
247M
    assert(to != NULL);
1392
245M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1393
1394
245M
    from_kind = PyUnicode_KIND(from);
1395
245M
    from_data = PyUnicode_DATA(from);
1396
245M
    to_kind = PyUnicode_KIND(to);
1397
245M
    to_data = PyUnicode_DATA(to);
1398
1399
#ifdef Py_DEBUG
1400
    if (!check_maxchar
1401
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1402
    {
1403
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1404
        Py_UCS4 ch;
1405
        Py_ssize_t i;
1406
        for (i=0; i < how_many; i++) {
1407
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1408
            assert(ch <= to_maxchar);
1409
        }
1410
    }
1411
#endif
1412
1413
245M
    if (from_kind == to_kind) {
1414
158M
        if (check_maxchar
1415
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1416
0
        {
1417
            /* Writing Latin-1 characters into an ASCII string requires to
1418
               check that all written characters are pure ASCII */
1419
0
            Py_UCS4 max_char;
1420
0
            max_char = ucs1lib_find_max_char(from_data,
1421
0
                                             (const Py_UCS1*)from_data + how_many);
1422
0
            if (max_char >= 128)
1423
0
                return -1;
1424
0
        }
1425
158M
        memcpy((char*)to_data + to_kind * to_start,
1426
158M
                  (const char*)from_data + from_kind * from_start,
1427
158M
                  to_kind * how_many);
1428
158M
    }
1429
86.7M
    else if (from_kind == PyUnicode_1BYTE_KIND
1430
84.6M
             && to_kind == PyUnicode_2BYTE_KIND)
1431
74.1M
    {
1432
74.1M
        _PyUnicode_CONVERT_BYTES(
1433
74.1M
            Py_UCS1, Py_UCS2,
1434
74.1M
            PyUnicode_1BYTE_DATA(from) + from_start,
1435
74.1M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1436
74.1M
            PyUnicode_2BYTE_DATA(to) + to_start
1437
74.1M
            );
1438
74.1M
    }
1439
12.6M
    else if (from_kind == PyUnicode_1BYTE_KIND
1440
10.4M
             && to_kind == PyUnicode_4BYTE_KIND)
1441
10.4M
    {
1442
10.4M
        _PyUnicode_CONVERT_BYTES(
1443
10.4M
            Py_UCS1, Py_UCS4,
1444
10.4M
            PyUnicode_1BYTE_DATA(from) + from_start,
1445
10.4M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1446
10.4M
            PyUnicode_4BYTE_DATA(to) + to_start
1447
10.4M
            );
1448
10.4M
    }
1449
2.17M
    else if (from_kind == PyUnicode_2BYTE_KIND
1450
2.16M
             && to_kind == PyUnicode_4BYTE_KIND)
1451
2.16M
    {
1452
2.16M
        _PyUnicode_CONVERT_BYTES(
1453
2.16M
            Py_UCS2, Py_UCS4,
1454
2.16M
            PyUnicode_2BYTE_DATA(from) + from_start,
1455
2.16M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1456
2.16M
            PyUnicode_4BYTE_DATA(to) + to_start
1457
2.16M
            );
1458
2.16M
    }
1459
11.7k
    else {
1460
11.7k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1461
1462
11.7k
        if (!check_maxchar) {
1463
11.7k
            if (from_kind == PyUnicode_2BYTE_KIND
1464
2.90k
                && to_kind == PyUnicode_1BYTE_KIND)
1465
2.90k
            {
1466
2.90k
                _PyUnicode_CONVERT_BYTES(
1467
2.90k
                    Py_UCS2, Py_UCS1,
1468
2.90k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1469
2.90k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1470
2.90k
                    PyUnicode_1BYTE_DATA(to) + to_start
1471
2.90k
                    );
1472
2.90k
            }
1473
8.80k
            else if (from_kind == PyUnicode_4BYTE_KIND
1474
8.80k
                     && to_kind == PyUnicode_1BYTE_KIND)
1475
5.87k
            {
1476
5.87k
                _PyUnicode_CONVERT_BYTES(
1477
5.87k
                    Py_UCS4, Py_UCS1,
1478
5.87k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1479
5.87k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1480
5.87k
                    PyUnicode_1BYTE_DATA(to) + to_start
1481
5.87k
                    );
1482
5.87k
            }
1483
2.92k
            else if (from_kind == PyUnicode_4BYTE_KIND
1484
2.92k
                     && to_kind == PyUnicode_2BYTE_KIND)
1485
2.92k
            {
1486
2.92k
                _PyUnicode_CONVERT_BYTES(
1487
2.92k
                    Py_UCS4, Py_UCS2,
1488
2.92k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1489
2.92k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1490
2.92k
                    PyUnicode_2BYTE_DATA(to) + to_start
1491
2.92k
                    );
1492
2.92k
            }
1493
0
            else {
1494
0
                Py_UNREACHABLE();
1495
0
            }
1496
11.7k
        }
1497
0
        else {
1498
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1499
0
            Py_UCS4 ch;
1500
0
            Py_ssize_t i;
1501
1502
0
            for (i=0; i < how_many; i++) {
1503
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1504
0
                if (ch > to_maxchar)
1505
0
                    return -1;
1506
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1507
0
            }
1508
0
        }
1509
11.7k
    }
1510
245M
    return 0;
1511
245M
}
1512
1513
void
1514
_PyUnicode_FastCopyCharacters(
1515
    PyObject *to, Py_ssize_t to_start,
1516
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1517
247M
{
1518
247M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1519
247M
}
1520
1521
Py_ssize_t
1522
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1523
                         PyObject *from, Py_ssize_t from_start,
1524
                         Py_ssize_t how_many)
1525
0
{
1526
0
    int err;
1527
1528
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1529
0
        PyErr_BadInternalCall();
1530
0
        return -1;
1531
0
    }
1532
1533
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1534
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1535
0
        return -1;
1536
0
    }
1537
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1538
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1539
0
        return -1;
1540
0
    }
1541
0
    if (how_many < 0) {
1542
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1543
0
        return -1;
1544
0
    }
1545
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1546
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1547
0
        PyErr_Format(PyExc_SystemError,
1548
0
                     "Cannot write %zi characters at %zi "
1549
0
                     "in a string of %zi characters",
1550
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1551
0
        return -1;
1552
0
    }
1553
1554
0
    if (how_many == 0)
1555
0
        return 0;
1556
1557
0
    if (unicode_check_modifiable(to))
1558
0
        return -1;
1559
1560
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1561
0
    if (err) {
1562
0
        PyErr_Format(PyExc_SystemError,
1563
0
                     "Cannot copy %s characters "
1564
0
                     "into a string of %s characters",
1565
0
                     unicode_kind_name(from),
1566
0
                     unicode_kind_name(to));
1567
0
        return -1;
1568
0
    }
1569
0
    return how_many;
1570
0
}
1571
1572
/* Find the maximum code point and count the number of surrogate pairs so a
1573
   correct string length can be computed before converting a string to UCS4.
1574
   This function counts single surrogates as a character and not as a pair.
1575
1576
   Return 0 on success, or -1 on error. */
1577
static int
1578
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1579
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1580
460k
{
1581
460k
    const wchar_t *iter;
1582
460k
    Py_UCS4 ch;
1583
1584
460k
    assert(num_surrogates != NULL && maxchar != NULL);
1585
460k
    *num_surrogates = 0;
1586
460k
    *maxchar = 0;
1587
1588
12.0M
    for (iter = begin; iter < end; ) {
1589
#if SIZEOF_WCHAR_T == 2
1590
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1591
            && (iter+1) < end
1592
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1593
        {
1594
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1595
            ++(*num_surrogates);
1596
            iter += 2;
1597
        }
1598
        else
1599
#endif
1600
11.6M
        {
1601
11.6M
            ch = *iter;
1602
11.6M
            iter++;
1603
11.6M
        }
1604
11.6M
        if (ch > *maxchar) {
1605
1.85M
            *maxchar = ch;
1606
1.85M
            if (*maxchar > MAX_UNICODE) {
1607
0
                PyErr_Format(PyExc_ValueError,
1608
0
                             "character U+%x is not in range [U+0000; U+%x]",
1609
0
                             ch, MAX_UNICODE);
1610
0
                return -1;
1611
0
            }
1612
1.85M
        }
1613
11.6M
    }
1614
460k
    return 0;
1615
460k
}
1616
1617
static void
1618
unicode_dealloc(PyObject *unicode)
1619
446M
{
1620
#ifdef Py_DEBUG
1621
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1622
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1623
    }
1624
#endif
1625
446M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1626
        /* This should never get called, but we also don't want to SEGV if
1627
        * we accidentally decref an immortal string out of existence. Since
1628
        * the string is an immortal object, just re-set the reference count.
1629
        */
1630
#ifdef Py_DEBUG
1631
        Py_UNREACHABLE();
1632
#endif
1633
0
        _Py_SetImmortal(unicode);
1634
0
        return;
1635
0
    }
1636
446M
    switch (_PyUnicode_STATE(unicode).interned) {
1637
446M
        case SSTATE_NOT_INTERNED:
1638
446M
            break;
1639
489k
        case SSTATE_INTERNED_MORTAL:
1640
            /* Remove the object from the intern dict.
1641
             * Before doing so, we set the refcount to 2: the key and value
1642
             * in the interned_dict.
1643
             */
1644
489k
            assert(Py_REFCNT(unicode) == 0);
1645
489k
            Py_SET_REFCNT(unicode, 2);
1646
#ifdef Py_REF_DEBUG
1647
            /* let's be pedantic with the ref total */
1648
            _Py_IncRefTotal(_PyThreadState_GET());
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
#endif
1651
489k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1652
489k
            PyObject *interned = get_interned_dict(interp);
1653
489k
            assert(interned != NULL);
1654
489k
            PyObject *popped;
1655
489k
            int r = PyDict_Pop(interned, unicode, &popped);
1656
489k
            if (r == -1) {
1657
0
                PyErr_FormatUnraisable("Exception ignored while "
1658
0
                                       "removing an interned string %R",
1659
0
                                       unicode);
1660
                // We don't know what happened to the string. It's probably
1661
                // best to leak it:
1662
                // - if it was popped, there are no more references to it
1663
                //   so it can't cause trouble (except wasted memory)
1664
                // - if it wasn't popped, it'll remain interned
1665
0
                _Py_SetImmortal(unicode);
1666
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1667
0
                return;
1668
0
            }
1669
489k
            if (r == 0) {
1670
                // The interned string was not found in the interned_dict.
1671
#ifdef Py_DEBUG
1672
                Py_UNREACHABLE();
1673
#endif
1674
0
                _Py_SetImmortal(unicode);
1675
0
                return;
1676
0
            }
1677
            // Successfully popped.
1678
489k
            assert(popped == unicode);
1679
            // Only our `popped` reference should be left; remove it too.
1680
489k
            assert(Py_REFCNT(unicode) == 1);
1681
489k
            Py_SET_REFCNT(unicode, 0);
1682
#ifdef Py_REF_DEBUG
1683
            /* let's be pedantic with the ref total */
1684
            _Py_DecRefTotal(_PyThreadState_GET());
1685
#endif
1686
489k
            break;
1687
0
        default:
1688
            // As with `statically_allocated` above.
1689
#ifdef Py_REF_DEBUG
1690
            Py_UNREACHABLE();
1691
#endif
1692
0
            _Py_SetImmortal(unicode);
1693
0
            return;
1694
446M
    }
1695
446M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1696
156k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1697
156k
    }
1698
446M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1699
16.1M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1700
16.1M
    }
1701
1702
446M
    Py_TYPE(unicode)->tp_free(unicode);
1703
446M
}
1704
1705
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the object _Py_STR(empty), or a one-character
   string whose character is below 256 and which is the object LATIN1()
   yields for that character; return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    if (ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && LATIN1(ch) == unicode);
}
#endif
1723
1724
int
1725
_PyUnicode_IsModifiable(PyObject *unicode)
1726
57.3M
{
1727
57.3M
    assert(_PyUnicode_CHECK(unicode));
1728
57.3M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1729
2.79M
        return 0;
1730
54.5M
    if (PyUnicode_HASH(unicode) != -1)
1731
0
        return 0;
1732
54.5M
    if (PyUnicode_CHECK_INTERNED(unicode))
1733
0
        return 0;
1734
54.5M
    if (!PyUnicode_CheckExact(unicode))
1735
0
        return 0;
1736
#ifdef Py_DEBUG
1737
    /* singleton refcount is greater than 1 */
1738
    assert(!unicode_is_singleton(unicode));
1739
#endif
1740
54.5M
    return 1;
1741
54.5M
}
1742
1743
static int
1744
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1745
1.71M
{
1746
1.71M
    PyObject *unicode;
1747
1.71M
    Py_ssize_t old_length;
1748
1749
1.71M
    assert(p_unicode != NULL);
1750
1.71M
    unicode = *p_unicode;
1751
1752
1.71M
    assert(unicode != NULL);
1753
1.71M
    assert(PyUnicode_Check(unicode));
1754
1.71M
    assert(0 <= length);
1755
1756
1.71M
    old_length = PyUnicode_GET_LENGTH(unicode);
1757
1.71M
    if (old_length == length)
1758
0
        return 0;
1759
1760
1.71M
    if (length == 0) {
1761
0
        PyObject *empty = _PyUnicode_GetEmpty();
1762
0
        Py_SETREF(*p_unicode, empty);
1763
0
        return 0;
1764
0
    }
1765
1766
1.71M
    if (!_PyUnicode_IsModifiable(unicode)) {
1767
0
        PyObject *copy = resize_copy(unicode, length);
1768
0
        if (copy == NULL)
1769
0
            return -1;
1770
0
        Py_SETREF(*p_unicode, copy);
1771
0
        return 0;
1772
0
    }
1773
1774
1.71M
    if (PyUnicode_IS_COMPACT(unicode)) {
1775
1.71M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1776
1.71M
        if (new_unicode == NULL)
1777
0
            return -1;
1778
1.71M
        *p_unicode = new_unicode;
1779
1.71M
        return 0;
1780
1.71M
    }
1781
0
    return resize_inplace(unicode, length);
1782
1.71M
}
1783
1784
int
1785
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1786
0
{
1787
0
    PyObject *unicode;
1788
0
    if (p_unicode == NULL) {
1789
0
        PyErr_BadInternalCall();
1790
0
        return -1;
1791
0
    }
1792
0
    unicode = *p_unicode;
1793
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1794
0
    {
1795
0
        PyErr_BadInternalCall();
1796
0
        return -1;
1797
0
    }
1798
0
    return unicode_resize(p_unicode, length);
1799
0
}
1800
1801
static PyObject*
1802
get_latin1_char(Py_UCS1 ch)
1803
182M
{
1804
182M
    PyObject *o = LATIN1(ch);
1805
182M
    return o;
1806
182M
}
1807
1808
static PyObject*
1809
unicode_char(Py_UCS4 ch)
1810
172M
{
1811
172M
    PyObject *unicode;
1812
1813
172M
    assert(ch <= MAX_UNICODE);
1814
1815
172M
    if (ch < 256) {
1816
102M
        return get_latin1_char(ch);
1817
102M
    }
1818
1819
70.4M
    unicode = PyUnicode_New(1, ch);
1820
70.4M
    if (unicode == NULL)
1821
0
        return NULL;
1822
1823
70.4M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1824
70.4M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1825
63.8M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1826
63.8M
    } else {
1827
6.65M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1828
6.65M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1829
6.65M
    }
1830
70.4M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1831
70.4M
    return unicode;
1832
70.4M
}
1833
1834
1835
static inline void
1836
unicode_write_widechar(int kind, void *data,
1837
                       const wchar_t *u, Py_ssize_t size,
1838
                       Py_ssize_t num_surrogates)
1839
460k
{
1840
460k
    switch (kind) {
1841
433k
    case PyUnicode_1BYTE_KIND:
1842
433k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1843
433k
        break;
1844
1845
26.6k
    case PyUnicode_2BYTE_KIND:
1846
#if SIZEOF_WCHAR_T == 2
1847
        memcpy(data, u, size * 2);
1848
#else
1849
26.6k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1850
26.6k
#endif
1851
26.6k
        break;
1852
1853
696
    case PyUnicode_4BYTE_KIND:
1854
696
    {
1855
#if SIZEOF_WCHAR_T == 2
1856
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1857
        // surrogate pairs.
1858
        const wchar_t *end = u + size;
1859
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1860
#  ifndef NDEBUG
1861
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1862
#  endif
1863
        for (const wchar_t *iter = u; iter < end; ) {
1864
            assert(ucs4_out < ucs4_end);
1865
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1866
                && (iter+1) < end
1867
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1868
            {
1869
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1870
                iter += 2;
1871
            }
1872
            else {
1873
                *ucs4_out++ = *iter;
1874
                iter++;
1875
            }
1876
        }
1877
        assert(ucs4_out == ucs4_end);
1878
#else
1879
696
        assert(num_surrogates == 0);
1880
696
        memcpy(data, u, size * 4);
1881
696
#endif
1882
696
        break;
1883
0
    }
1884
0
    default:
1885
0
        Py_UNREACHABLE();
1886
460k
    }
1887
460k
}
1888
1889
1890
PyObject *
1891
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1892
849k
{
1893
849k
    PyObject *unicode;
1894
849k
    Py_UCS4 maxchar = 0;
1895
849k
    Py_ssize_t num_surrogates;
1896
1897
849k
    if (u == NULL && size != 0) {
1898
0
        PyErr_BadInternalCall();
1899
0
        return NULL;
1900
0
    }
1901
1902
849k
    if (size == -1) {
1903
1.29k
        size = wcslen(u);
1904
1.29k
    }
1905
1906
    /* If the Unicode data is known at construction time, we can apply
1907
       some optimizations which share commonly used objects. */
1908
1909
    /* Optimization for empty strings */
1910
849k
    if (size == 0)
1911
308k
        _Py_RETURN_UNICODE_EMPTY();
1912
1913
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1914
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1915
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1916
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1917
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1918
        if (!converted) {
1919
            return NULL;
1920
        }
1921
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1922
        PyMem_Free(converted);
1923
        return unicode;
1924
    }
1925
#endif
1926
1927
    /* Single character Unicode objects in the Latin-1 range are
1928
       shared when using this constructor */
1929
540k
    if (size == 1 && (Py_UCS4)*u < 256)
1930
79.6k
        return get_latin1_char((unsigned char)*u);
1931
1932
    /* If not empty and not single character, copy the Unicode data
1933
       into the new object */
1934
460k
    if (find_maxchar_surrogates(u, u + size,
1935
460k
                                &maxchar, &num_surrogates) == -1)
1936
0
        return NULL;
1937
1938
460k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1939
460k
    if (!unicode)
1940
0
        return NULL;
1941
1942
460k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1943
460k
                           u, size, num_surrogates);
1944
1945
460k
    return unicode_result(unicode);
1946
460k
}
1947
1948
1949
int
1950
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1951
                              const wchar_t *str,
1952
                              Py_ssize_t size)
1953
0
{
1954
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1955
1956
0
    if (size < 0) {
1957
0
        size = wcslen(str);
1958
0
    }
1959
1960
0
    if (size == 0) {
1961
0
        return 0;
1962
0
    }
1963
1964
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1965
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1966
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1967
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1968
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1969
        if (!converted) {
1970
            return -1;
1971
        }
1972
1973
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1974
        PyMem_Free(converted);
1975
        return res;
1976
    }
1977
#endif
1978
1979
0
    Py_UCS4 maxchar = 0;
1980
0
    Py_ssize_t num_surrogates;
1981
0
    if (find_maxchar_surrogates(str, str + size,
1982
0
                                &maxchar, &num_surrogates) == -1) {
1983
0
        return -1;
1984
0
    }
1985
1986
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1987
0
        return -1;
1988
0
    }
1989
1990
0
    int kind = writer->kind;
1991
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1992
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1993
1994
0
    writer->pos += size - num_surrogates;
1995
0
    return 0;
1996
0
}
1997
1998
1999
PyObject *
2000
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2001
6.37M
{
2002
6.37M
    if (size < 0) {
2003
0
        PyErr_SetString(PyExc_SystemError,
2004
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2005
0
        return NULL;
2006
0
    }
2007
6.37M
    if (u != NULL) {
2008
6.37M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2009
6.37M
    }
2010
0
    if (size > 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
0
    return _PyUnicode_GetEmpty();
2016
0
}
2017
2018
PyObject *
2019
PyUnicode_FromString(const char *u)
2020
12.5M
{
2021
12.5M
    size_t size = strlen(u);
2022
12.5M
    if (size > PY_SSIZE_T_MAX) {
2023
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2024
0
        return NULL;
2025
0
    }
2026
12.5M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2027
12.5M
}
2028
2029
2030
PyObject *
2031
_PyUnicode_FromId(_Py_Identifier *id)
2032
0
{
2033
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2034
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2035
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2036
2037
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2038
0
    if (index < 0) {
2039
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2040
2041
0
        PyMutex_Lock(&rt_ids->mutex);
2042
        // Check again to detect concurrent access. Another thread can have
2043
        // initialized the index while this thread waited for the lock.
2044
0
        index = _Py_atomic_load_ssize(&id->index);
2045
0
        if (index < 0) {
2046
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2047
0
            index = rt_ids->next_index;
2048
0
            rt_ids->next_index++;
2049
0
            _Py_atomic_store_ssize(&id->index, index);
2050
0
        }
2051
0
        PyMutex_Unlock(&rt_ids->mutex);
2052
0
    }
2053
0
    assert(index >= 0);
2054
2055
0
    PyObject *obj;
2056
0
    if (index < ids->size) {
2057
0
        obj = ids->array[index];
2058
0
        if (obj) {
2059
            // Return a borrowed reference
2060
0
            goto end;
2061
0
        }
2062
0
    }
2063
2064
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2065
0
                                       NULL, NULL);
2066
0
    if (!obj) {
2067
0
        goto end;
2068
0
    }
2069
0
    _PyUnicode_InternImmortal(interp, &obj);
2070
2071
0
    if (index >= ids->size) {
2072
        // Overallocate to reduce the number of realloc
2073
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2074
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2075
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2076
0
        if (new_array == NULL) {
2077
0
            PyErr_NoMemory();
2078
0
            obj = NULL;
2079
0
            goto end;
2080
0
        }
2081
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2082
0
        ids->array = new_array;
2083
0
        ids->size = new_size;
2084
0
    }
2085
2086
    // The array stores a strong reference
2087
0
    ids->array[index] = obj;
2088
2089
0
end:
2090
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2091
    // Return a borrowed reference
2092
0
    return obj;
2093
0
}
2094
2095
2096
static void
2097
unicode_clear_identifiers(struct _Py_unicode_state *state)
2098
0
{
2099
0
    struct _Py_unicode_ids *ids = &state->ids;
2100
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2101
0
        Py_XDECREF(ids->array[i]);
2102
0
    }
2103
0
    ids->size = 0;
2104
0
    PyMem_Free(ids->array);
2105
0
    ids->array = NULL;
2106
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2107
    // after Py_Finalize().
2108
0
}
2109
2110
2111
/* Internal function, doesn't check maximum character */
2112
2113
PyObject*
2114
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2115
117M
{
2116
117M
    const unsigned char *s = (const unsigned char *)buffer;
2117
117M
    PyObject *unicode;
2118
117M
    if (size == 1) {
2119
#ifdef Py_DEBUG
2120
        assert((unsigned char)s[0] < 128);
2121
#endif
2122
44.5M
        return get_latin1_char(s[0]);
2123
44.5M
    }
2124
72.7M
    unicode = PyUnicode_New(size, 127);
2125
72.7M
    if (!unicode)
2126
0
        return NULL;
2127
72.7M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2128
72.7M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2129
72.7M
    return unicode;
2130
72.7M
}
2131
2132
static Py_UCS4
2133
kind_maxchar_limit(int kind)
2134
0
{
2135
0
    switch (kind) {
2136
0
    case PyUnicode_1BYTE_KIND:
2137
0
        return 0x80;
2138
0
    case PyUnicode_2BYTE_KIND:
2139
0
        return 0x100;
2140
0
    case PyUnicode_4BYTE_KIND:
2141
0
        return 0x10000;
2142
0
    default:
2143
0
        Py_UNREACHABLE();
2144
0
    }
2145
0
}
2146
2147
static PyObject*
2148
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2149
50.8M
{
2150
50.8M
    PyObject *res;
2151
50.8M
    unsigned char max_char;
2152
2153
50.8M
    if (size == 0) {
2154
10.3M
        _Py_RETURN_UNICODE_EMPTY();
2155
10.3M
    }
2156
50.8M
    assert(size > 0);
2157
40.5M
    if (size == 1) {
2158
10.2M
        return get_latin1_char(u[0]);
2159
10.2M
    }
2160
2161
30.3M
    max_char = ucs1lib_find_max_char(u, u + size);
2162
30.3M
    res = PyUnicode_New(size, max_char);
2163
30.3M
    if (!res)
2164
0
        return NULL;
2165
30.3M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2166
30.3M
    assert(_PyUnicode_CheckConsistency(res, 1));
2167
30.3M
    return res;
2168
30.3M
}
2169
2170
static PyObject*
2171
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2172
93.2M
{
2173
93.2M
    PyObject *res;
2174
93.2M
    Py_UCS2 max_char;
2175
2176
93.2M
    if (size == 0)
2177
11.4M
        _Py_RETURN_UNICODE_EMPTY();
2178
93.2M
    assert(size > 0);
2179
81.8M
    if (size == 1)
2180
55.6M
        return unicode_char(u[0]);
2181
2182
26.1M
    max_char = ucs2lib_find_max_char(u, u + size);
2183
26.1M
    res = PyUnicode_New(size, max_char);
2184
26.1M
    if (!res)
2185
0
        return NULL;
2186
26.1M
    if (max_char >= 256)
2187
17.1M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2188
9.02M
    else {
2189
9.02M
        _PyUnicode_CONVERT_BYTES(
2190
9.02M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2191
9.02M
    }
2192
26.1M
    assert(_PyUnicode_CheckConsistency(res, 1));
2193
26.1M
    return res;
2194
26.1M
}
2195
2196
static PyObject*
2197
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2198
55.7M
{
2199
55.7M
    PyObject *res;
2200
55.7M
    Py_UCS4 max_char;
2201
2202
55.7M
    if (size == 0)
2203
6.79M
        _Py_RETURN_UNICODE_EMPTY();
2204
55.7M
    assert(size > 0);
2205
48.9M
    if (size == 1)
2206
31.4M
        return unicode_char(u[0]);
2207
2208
17.4M
    max_char = ucs4lib_find_max_char(u, u + size);
2209
17.4M
    res = PyUnicode_New(size, max_char);
2210
17.4M
    if (!res)
2211
0
        return NULL;
2212
17.4M
    if (max_char < 256)
2213
12.3M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2214
17.4M
                                 PyUnicode_1BYTE_DATA(res));
2215
5.08M
    else if (max_char < 0x10000)
2216
3.77M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2217
5.08M
                                 PyUnicode_2BYTE_DATA(res));
2218
1.30M
    else
2219
1.30M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2220
17.4M
    assert(_PyUnicode_CheckConsistency(res, 1));
2221
17.4M
    return res;
2222
17.4M
}
2223
2224
2225
int
2226
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2227
                          const Py_UCS4 *str,
2228
                          Py_ssize_t size)
2229
0
{
2230
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2231
2232
0
    if (size < 0) {
2233
0
        PyErr_SetString(PyExc_ValueError,
2234
0
                        "size must be positive");
2235
0
        return -1;
2236
0
    }
2237
2238
0
    if (size == 0) {
2239
0
        return 0;
2240
0
    }
2241
2242
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2243
2244
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2245
0
        return -1;
2246
0
    }
2247
2248
0
    int kind = writer->kind;
2249
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2250
0
    if (kind == PyUnicode_1BYTE_KIND) {
2251
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2252
0
                                 str, str + size,
2253
0
                                 data);
2254
0
    }
2255
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2256
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2257
0
                                 str, str + size,
2258
0
                                 data);
2259
0
    }
2260
0
    else {
2261
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2262
0
    }
2263
0
    writer->pos += size;
2264
2265
0
    return 0;
2266
0
}
2267
2268
2269
PyObject*
2270
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2271
147M
{
2272
147M
    if (size < 0) {
2273
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2274
0
        return NULL;
2275
0
    }
2276
147M
    switch (kind) {
2277
20.5M
    case PyUnicode_1BYTE_KIND:
2278
20.5M
        return _PyUnicode_FromUCS1(buffer, size);
2279
80.9M
    case PyUnicode_2BYTE_KIND:
2280
80.9M
        return _PyUnicode_FromUCS2(buffer, size);
2281
46.3M
    case PyUnicode_4BYTE_KIND:
2282
46.3M
        return _PyUnicode_FromUCS4(buffer, size);
2283
0
    default:
2284
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2285
0
        return NULL;
2286
147M
    }
2287
147M
}
2288
2289
Py_UCS4
2290
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2291
13.4M
{
2292
13.4M
    int kind;
2293
13.4M
    const void *startptr, *endptr;
2294
2295
13.4M
    assert(0 <= start);
2296
13.4M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2297
13.4M
    assert(start <= end);
2298
2299
13.4M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2300
69.1k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2301
2302
13.3M
    if (start == end)
2303
0
        return 127;
2304
2305
13.3M
    if (PyUnicode_IS_ASCII(unicode))
2306
13.3M
        return 127;
2307
2308
21.3k
    kind = PyUnicode_KIND(unicode);
2309
21.3k
    startptr = PyUnicode_DATA(unicode);
2310
21.3k
    endptr = (char *)startptr + end * kind;
2311
21.3k
    startptr = (char *)startptr + start * kind;
2312
21.3k
    switch(kind) {
2313
1.54k
    case PyUnicode_1BYTE_KIND:
2314
1.54k
        return ucs1lib_find_max_char(startptr, endptr);
2315
4.29k
    case PyUnicode_2BYTE_KIND:
2316
4.29k
        return ucs2lib_find_max_char(startptr, endptr);
2317
15.5k
    case PyUnicode_4BYTE_KIND:
2318
15.5k
        return ucs4lib_find_max_char(startptr, endptr);
2319
0
    default:
2320
0
        Py_UNREACHABLE();
2321
21.3k
    }
2322
21.3k
}
2323
2324
/* Ensure that a string uses the most efficient storage, if it is not the
2325
   case: create a new string of the right kind. Write NULL into *p_unicode
2326
   on error. */
2327
static void
2328
unicode_adjust_maxchar(PyObject **p_unicode)
2329
0
{
2330
0
    PyObject *unicode, *copy;
2331
0
    Py_UCS4 max_char;
2332
0
    Py_ssize_t len;
2333
0
    int kind;
2334
2335
0
    assert(p_unicode != NULL);
2336
0
    unicode = *p_unicode;
2337
0
    if (PyUnicode_IS_ASCII(unicode))
2338
0
        return;
2339
2340
0
    len = PyUnicode_GET_LENGTH(unicode);
2341
0
    kind = PyUnicode_KIND(unicode);
2342
0
    if (kind == PyUnicode_1BYTE_KIND) {
2343
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2344
0
        max_char = ucs1lib_find_max_char(u, u + len);
2345
0
        if (max_char >= 128)
2346
0
            return;
2347
0
    }
2348
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2349
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2350
0
        max_char = ucs2lib_find_max_char(u, u + len);
2351
0
        if (max_char >= 256)
2352
0
            return;
2353
0
    }
2354
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2355
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2356
0
        max_char = ucs4lib_find_max_char(u, u + len);
2357
0
        if (max_char >= 0x10000)
2358
0
            return;
2359
0
    }
2360
0
    else
2361
0
        Py_UNREACHABLE();
2362
2363
0
    copy = PyUnicode_New(len, max_char);
2364
0
    if (copy != NULL)
2365
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2366
0
    Py_DECREF(unicode);
2367
0
    *p_unicode = copy;
2368
0
}
2369
2370
/* Return a new str object holding the same characters as `unicode`.

   Reports a bad internal call and returns NULL if `unicode` is not a str;
   also returns NULL if the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }

    /* Same maximum character, hence same kind: the character data can be
       copied byte for byte. */
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           length * PyUnicode_KIND(unicode));

    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2392
2393
2394
/* Widen Unicode objects to larger buffers. Don't write terminating null
2395
   character. Return NULL on error. */
2396
2397
static void*
2398
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2399
6.19M
{
2400
6.19M
    void *result;
2401
2402
6.19M
    assert(skind < kind);
2403
6.19M
    switch (kind) {
2404
4.66M
    case PyUnicode_2BYTE_KIND:
2405
4.66M
        result = PyMem_New(Py_UCS2, len);
2406
4.66M
        if (!result)
2407
0
            return PyErr_NoMemory();
2408
4.66M
        assert(skind == PyUnicode_1BYTE_KIND);
2409
4.66M
        _PyUnicode_CONVERT_BYTES(
2410
4.66M
            Py_UCS1, Py_UCS2,
2411
4.66M
            (const Py_UCS1 *)data,
2412
4.66M
            ((const Py_UCS1 *)data) + len,
2413
4.66M
            result);
2414
4.66M
        return result;
2415
1.52M
    case PyUnicode_4BYTE_KIND:
2416
1.52M
        result = PyMem_New(Py_UCS4, len);
2417
1.52M
        if (!result)
2418
0
            return PyErr_NoMemory();
2419
1.52M
        if (skind == PyUnicode_2BYTE_KIND) {
2420
0
            _PyUnicode_CONVERT_BYTES(
2421
0
                Py_UCS2, Py_UCS4,
2422
0
                (const Py_UCS2 *)data,
2423
0
                ((const Py_UCS2 *)data) + len,
2424
0
                result);
2425
0
        }
2426
1.52M
        else {
2427
1.52M
            assert(skind == PyUnicode_1BYTE_KIND);
2428
1.52M
            _PyUnicode_CONVERT_BYTES(
2429
1.52M
                Py_UCS1, Py_UCS4,
2430
1.52M
                (const Py_UCS1 *)data,
2431
1.52M
                ((const Py_UCS1 *)data) + len,
2432
1.52M
                result);
2433
1.52M
        }
2434
1.52M
        return result;
2435
0
    default:
2436
0
        Py_UNREACHABLE();
2437
0
        return NULL;
2438
6.19M
    }
2439
6.19M
}
2440
2441
static Py_UCS4*
2442
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2443
        int copy_null)
2444
75.3k
{
2445
75.3k
    int kind;
2446
75.3k
    const void *data;
2447
75.3k
    Py_ssize_t len, targetlen;
2448
75.3k
    kind = PyUnicode_KIND(string);
2449
75.3k
    data = PyUnicode_DATA(string);
2450
75.3k
    len = PyUnicode_GET_LENGTH(string);
2451
75.3k
    targetlen = len;
2452
75.3k
    if (copy_null)
2453
0
        targetlen++;
2454
75.3k
    if (!target) {
2455
0
        target = PyMem_New(Py_UCS4, targetlen);
2456
0
        if (!target) {
2457
0
            PyErr_NoMemory();
2458
0
            return NULL;
2459
0
        }
2460
0
    }
2461
75.3k
    else {
2462
75.3k
        if (targetsize < targetlen) {
2463
0
            PyErr_Format(PyExc_SystemError,
2464
0
                         "string is longer than the buffer");
2465
0
            if (copy_null && 0 < targetsize)
2466
0
                target[0] = 0;
2467
0
            return NULL;
2468
0
        }
2469
75.3k
    }
2470
75.3k
    if (kind == PyUnicode_1BYTE_KIND) {
2471
52.4k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2472
52.4k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2473
52.4k
    }
2474
22.8k
    else if (kind == PyUnicode_2BYTE_KIND) {
2475
16.4k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2476
16.4k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2477
16.4k
    }
2478
6.38k
    else if (kind == PyUnicode_4BYTE_KIND) {
2479
6.38k
        memcpy(target, data, len * sizeof(Py_UCS4));
2480
6.38k
    }
2481
0
    else {
2482
0
        Py_UNREACHABLE();
2483
0
    }
2484
75.3k
    if (copy_null)
2485
0
        target[len] = 0;
2486
75.3k
    return target;
2487
75.3k
}
2488
2489
Py_UCS4*
2490
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2491
                 int copy_null)
2492
75.3k
{
2493
75.3k
    if (target == NULL || targetsize < 0) {
2494
0
        PyErr_BadInternalCall();
2495
0
        return NULL;
2496
0
    }
2497
75.3k
    return as_ucs4(string, target, targetsize, copy_null);
2498
75.3k
}
2499
2500
Py_UCS4*
2501
PyUnicode_AsUCS4Copy(PyObject *string)
2502
0
{
2503
0
    return as_ucs4(string, NULL, 0, 1);
2504
0
}
2505
2506
/* maximum number of characters required for output of %jo or %jd or %p.
2507
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2508
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2509
   plus 1 for the terminal NUL. */
2510
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2511
2512
static int
2513
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2514
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2515
8.14M
{
2516
8.14M
    Py_ssize_t length, fill, arglen;
2517
8.14M
    Py_UCS4 maxchar;
2518
2519
8.14M
    length = PyUnicode_GET_LENGTH(str);
2520
8.14M
    if ((precision == -1 || precision >= length)
2521
8.14M
        && width <= length)
2522
8.14M
        return _PyUnicodeWriter_WriteStr(writer, str);
2523
2524
50
    if (precision != -1)
2525
50
        length = Py_MIN(precision, length);
2526
2527
50
    arglen = Py_MAX(length, width);
2528
50
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2529
21
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2530
29
    else
2531
29
        maxchar = writer->maxchar;
2532
2533
50
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2534
0
        return -1;
2535
2536
50
    fill = Py_MAX(width - length, 0);
2537
50
    if (fill && !(flags & F_LJUST)) {
2538
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2539
0
            return -1;
2540
0
        writer->pos += fill;
2541
0
    }
2542
2543
50
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2544
50
                                  str, 0, length);
2545
50
    writer->pos += length;
2546
2547
50
    if (fill && (flags & F_LJUST)) {
2548
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2549
0
            return -1;
2550
0
        writer->pos += fill;
2551
0
    }
2552
2553
50
    return 0;
2554
50
}
2555
2556
static int
2557
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2558
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2559
3.15M
{
2560
    /* UTF-8 */
2561
3.15M
    Py_ssize_t *pconsumed = NULL;
2562
3.15M
    Py_ssize_t length;
2563
3.15M
    if (precision == -1) {
2564
946k
        length = strlen(str);
2565
946k
    }
2566
2.21M
    else {
2567
2.21M
        length = 0;
2568
34.9M
        while (length < precision && str[length]) {
2569
32.7M
            length++;
2570
32.7M
        }
2571
2.21M
        if (length == precision) {
2572
            /* The input string is not NUL-terminated.  If it ends with an
2573
             * incomplete UTF-8 sequence, truncate the string just before it.
2574
             * Incomplete sequences in the middle and sequences which cannot
2575
             * be valid prefixes are still treated as errors and replaced
2576
             * with \xfffd. */
2577
1.77k
            pconsumed = &length;
2578
1.77k
        }
2579
2.21M
    }
2580
2581
3.15M
    if (width < 0) {
2582
3.15M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2583
3.15M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2584
3.15M
    }
2585
2586
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2587
0
                                                     "replace", pconsumed);
2588
0
    if (unicode == NULL)
2589
0
        return -1;
2590
2591
0
    int res = unicode_fromformat_write_str(writer, unicode,
2592
0
                                           width, -1, flags);
2593
0
    Py_DECREF(unicode);
2594
0
    return res;
2595
0
}
2596
2597
static int
2598
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2599
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2600
0
{
2601
0
    Py_ssize_t length;
2602
0
    if (precision == -1) {
2603
0
        length = wcslen(str);
2604
0
    }
2605
0
    else {
2606
0
        length = 0;
2607
0
        while (length < precision && str[length]) {
2608
0
            length++;
2609
0
        }
2610
0
    }
2611
2612
0
    if (width < 0) {
2613
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2614
0
                                             str, length);
2615
0
    }
2616
2617
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2618
0
    if (unicode == NULL)
2619
0
        return -1;
2620
2621
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2622
0
    Py_DECREF(unicode);
2623
0
    return res;
2624
0
}
2625
2626
0
/* Distinct small integer codes (1..5).
   NOTE(review): presumably these identify the size modifier (long,
   long long, size_t, ptrdiff_t, intmax_t) parsed by
   unicode_fromformat_arg() -- confirm against that function's body. */
#define F_LONG 1
2627
0
#define F_LONGLONG 2
2628
217k
#define F_SIZE 3
2629
0
#define F_PTRDIFF 4
2630
0
#define F_INTMAX 5
2631
2632
static const char*
2633
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2634
                       const char *f, va_list *vargs)
2635
24.6M
{
2636
24.6M
    const char *p;
2637
24.6M
    Py_ssize_t len;
2638
24.6M
    int flags = 0;
2639
24.6M
    Py_ssize_t width;
2640
24.6M
    Py_ssize_t precision;
2641
2642
24.6M
    p = f;
2643
24.6M
    f++;
2644
24.6M
    if (*f == '%') {
2645
754k
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2646
0
            return NULL;
2647
754k
        f++;
2648
754k
        return f;
2649
754k
    }
2650
2651
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2652
    /* Flags '+', ' ' and '#' are not particularly useful.
2653
     * They are not worth the implementation and maintenance costs.
2654
     * In addition, '#' should add "0" for "o" conversions for compatibility
2655
     * with printf, but it would confuse Python users. */
2656
23.9M
    while (1) {
2657
23.9M
        switch (*f++) {
2658
0
        case '-': flags |= F_LJUST; continue;
2659
1.61k
        case '0': flags |= F_ZERO; continue;
2660
0
        case '#': flags |= F_ALT; continue;
2661
23.9M
        }
2662
23.9M
        f--;
2663
23.9M
        break;
2664
23.9M
    }
2665
2666
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2667
23.9M
    width = -1;
2668
23.9M
    if (*f == '*') {
2669
0
        width = va_arg(*vargs, int);
2670
0
        if (width < 0) {
2671
0
            flags |= F_LJUST;
2672
0
            width = -width;
2673
0
        }
2674
0
        f++;
2675
0
    }
2676
23.9M
    else if (Py_ISDIGIT((unsigned)*f)) {
2677
1.61k
        width = *f - '0';
2678
1.61k
        f++;
2679
1.61k
        while (Py_ISDIGIT((unsigned)*f)) {
2680
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2681
0
                PyErr_SetString(PyExc_ValueError,
2682
0
                                "width too big");
2683
0
                return NULL;
2684
0
            }
2685
0
            width = (width * 10) + (*f - '0');
2686
0
            f++;
2687
0
        }
2688
1.61k
    }
2689
23.9M
    precision = -1;
2690
23.9M
    if (*f == '.') {
2691
5.30M
        f++;
2692
5.30M
        if (*f == '*') {
2693
0
            precision = va_arg(*vargs, int);
2694
0
            if (precision < 0) {
2695
0
                precision = -2;
2696
0
            }
2697
0
            f++;
2698
0
        }
2699
5.30M
        else if (Py_ISDIGIT((unsigned)*f)) {
2700
5.30M
            precision = (*f - '0');
2701
5.30M
            f++;
2702
15.9M
            while (Py_ISDIGIT((unsigned)*f)) {
2703
10.6M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2704
0
                    PyErr_SetString(PyExc_ValueError,
2705
0
                                    "precision too big");
2706
0
                    return NULL;
2707
0
                }
2708
10.6M
                precision = (precision * 10) + (*f - '0');
2709
10.6M
                f++;
2710
10.6M
            }
2711
5.30M
        }
2712
5.30M
    }
2713
2714
23.9M
    int sizemod = 0;
2715
23.9M
    if (*f == 'l') {
2716
0
        if (f[1] == 'l') {
2717
0
            sizemod = F_LONGLONG;
2718
0
            f += 2;
2719
0
        }
2720
0
        else {
2721
0
            sizemod = F_LONG;
2722
0
            ++f;
2723
0
        }
2724
0
    }
2725
23.9M
    else if (*f == 'z') {
2726
108k
        sizemod = F_SIZE;
2727
108k
        ++f;
2728
108k
    }
2729
23.8M
    else if (*f == 't') {
2730
0
        sizemod = F_PTRDIFF;
2731
0
        ++f;
2732
0
    }
2733
23.8M
    else if (*f == 'j') {
2734
0
        sizemod = F_INTMAX;
2735
0
        ++f;
2736
0
    }
2737
23.9M
    if (f[0] != '\0' && f[1] == '\0')
2738
4.65M
        writer->overallocate = 0;
2739
2740
23.9M
    switch (*f) {
2741
11.1M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2742
11.1M
        break;
2743
1.46M
    case 'c': case 'p':
2744
1.46M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2745
1.46M
        break;
2746
3.15M
    case 's':
2747
3.15M
    case 'V':
2748
3.15M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2749
3.15M
        break;
2750
8.14M
    default:
2751
8.14M
        if (sizemod) goto invalid_format;
2752
8.14M
        break;
2753
23.9M
    }
2754
2755
23.9M
    switch (*f) {
2756
1.46M
    case 'c':
2757
1.46M
    {
2758
1.46M
        int ordinal = va_arg(*vargs, int);
2759
1.46M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2760
0
            PyErr_SetString(PyExc_OverflowError,
2761
0
                            "character argument not in range(0x110000)");
2762
0
            return NULL;
2763
0
        }
2764
1.46M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2765
0
            return NULL;
2766
1.46M
        break;
2767
1.46M
    }
2768
2769
11.1M
    case 'd': case 'i':
2770
11.1M
    case 'o': case 'u': case 'x': case 'X':
2771
11.1M
    {
2772
11.1M
        char buffer[MAX_INTMAX_CHARS];
2773
2774
        // Fill buffer using sprinf, with one of many possible format
2775
        // strings, like "%llX" for `long long` in hexadecimal.
2776
        // The type/size is in `sizemod`; the format is in `*f`.
2777
2778
        // Use macros with nested switches to keep the sprintf format strings
2779
        // as compile-time literals, avoiding warnings and maybe allowing
2780
        // optimizations.
2781
2782
        // `SPRINT` macro does one sprintf
2783
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2784
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2785
11.1M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2786
11.1M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2787
2788
        // One inner switch to handle all format variants
2789
11.1M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2790
11.1M
            switch (*f) {                                                     \
2791
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2792
16.2k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2793
1.18k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2794
944
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2795
11.1M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2796
11.1M
            }
2797
2798
        // Outer switch to handle all the sizes/types
2799
11.1M
        switch (sizemod) {
2800
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2801
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2802
108k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2803
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2804
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2805
11.0M
            default:         DO_SPRINTS("", int, unsigned int); break;
2806
11.1M
        }
2807
11.1M
        #undef SPRINT
2808
11.1M
        #undef DO_SPRINTS
2809
2810
11.1M
        assert(len >= 0);
2811
2812
11.1M
        int sign = (buffer[0] == '-');
2813
11.1M
        len -= sign;
2814
2815
11.1M
        precision = Py_MAX(precision, len);
2816
11.1M
        width = Py_MAX(width, precision + sign);
2817
11.1M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2818
1.61k
            precision = width - sign;
2819
1.61k
        }
2820
2821
11.1M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2822
11.1M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2823
2824
11.1M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2825
0
            return NULL;
2826
2827
11.1M
        if (spacepad && !(flags & F_LJUST)) {
2828
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2829
0
                return NULL;
2830
0
            writer->pos += spacepad;
2831
0
        }
2832
2833
11.1M
        if (sign) {
2834
778
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2835
0
                return NULL;
2836
778
        }
2837
2838
11.1M
        if (zeropad) {
2839
612
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2840
0
                return NULL;
2841
612
            writer->pos += zeropad;
2842
612
        }
2843
2844
11.1M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2845
0
            return NULL;
2846
2847
11.1M
        if (spacepad && (flags & F_LJUST)) {
2848
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2849
0
                return NULL;
2850
0
            writer->pos += spacepad;
2851
0
        }
2852
11.1M
        break;
2853
11.1M
    }
2854
2855
11.1M
    case 'p':
2856
2.67k
    {
2857
2.67k
        char number[MAX_INTMAX_CHARS];
2858
2859
2.67k
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2860
2.67k
        assert(len >= 0);
2861
2862
        /* %p is ill-defined:  ensure leading 0x. */
2863
2.67k
        if (number[1] == 'X')
2864
0
            number[1] = 'x';
2865
2.67k
        else if (number[1] != 'x') {
2866
0
            memmove(number + 2, number,
2867
0
                    strlen(number) + 1);
2868
0
            number[0] = '0';
2869
0
            number[1] = 'x';
2870
0
            len += 2;
2871
0
        }
2872
2873
2.67k
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2874
0
            return NULL;
2875
2.67k
        break;
2876
2.67k
    }
2877
2878
3.15M
    case 's':
2879
3.15M
    {
2880
3.15M
        if (sizemod) {
2881
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2882
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2883
0
                return NULL;
2884
0
        }
2885
3.15M
        else {
2886
            /* UTF-8 */
2887
3.15M
            const char *s = va_arg(*vargs, const char*);
2888
3.15M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2889
0
                return NULL;
2890
3.15M
        }
2891
3.15M
        break;
2892
3.15M
    }
2893
2894
3.59M
    case 'U':
2895
3.59M
    {
2896
3.59M
        PyObject *obj = va_arg(*vargs, PyObject *);
2897
3.59M
        assert(obj && _PyUnicode_CHECK(obj));
2898
2899
3.59M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2900
0
            return NULL;
2901
3.59M
        break;
2902
3.59M
    }
2903
2904
3.59M
    case 'V':
2905
517
    {
2906
517
        PyObject *obj = va_arg(*vargs, PyObject *);
2907
517
        const char *str;
2908
517
        const wchar_t *wstr;
2909
517
        if (sizemod) {
2910
0
            wstr = va_arg(*vargs, const wchar_t*);
2911
0
        }
2912
517
        else {
2913
517
            str = va_arg(*vargs, const char *);
2914
517
        }
2915
517
        if (obj) {
2916
0
            assert(_PyUnicode_CHECK(obj));
2917
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2918
0
                return NULL;
2919
0
        }
2920
517
        else if (sizemod) {
2921
0
            assert(wstr != NULL);
2922
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2923
0
                return NULL;
2924
0
        }
2925
517
        else {
2926
517
            assert(str != NULL);
2927
517
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2928
0
                return NULL;
2929
517
        }
2930
517
        break;
2931
517
    }
2932
2933
1.65k
    case 'S':
2934
1.65k
    {
2935
1.65k
        PyObject *obj = va_arg(*vargs, PyObject *);
2936
1.65k
        PyObject *str;
2937
1.65k
        assert(obj);
2938
1.65k
        str = PyObject_Str(obj);
2939
1.65k
        if (!str)
2940
0
            return NULL;
2941
1.65k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2942
0
            Py_DECREF(str);
2943
0
            return NULL;
2944
0
        }
2945
1.65k
        Py_DECREF(str);
2946
1.65k
        break;
2947
1.65k
    }
2948
2949
3.09M
    case 'R':
2950
3.09M
    {
2951
3.09M
        PyObject *obj = va_arg(*vargs, PyObject *);
2952
3.09M
        PyObject *repr;
2953
3.09M
        assert(obj);
2954
3.09M
        repr = PyObject_Repr(obj);
2955
3.09M
        if (!repr)
2956
0
            return NULL;
2957
3.09M
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2958
0
            Py_DECREF(repr);
2959
0
            return NULL;
2960
0
        }
2961
3.09M
        Py_DECREF(repr);
2962
3.09M
        break;
2963
3.09M
    }
2964
2965
0
    case 'A':
2966
0
    {
2967
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2968
0
        PyObject *ascii;
2969
0
        assert(obj);
2970
0
        ascii = PyObject_ASCII(obj);
2971
0
        if (!ascii)
2972
0
            return NULL;
2973
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2974
0
            Py_DECREF(ascii);
2975
0
            return NULL;
2976
0
        }
2977
0
        Py_DECREF(ascii);
2978
0
        break;
2979
0
    }
2980
2981
1.45M
    case 'T':
2982
1.45M
    {
2983
1.45M
        PyObject *obj = va_arg(*vargs, PyObject *);
2984
1.45M
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2985
2986
1.45M
        PyObject *type_name;
2987
1.45M
        if (flags & F_ALT) {
2988
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2989
0
        }
2990
1.45M
        else {
2991
1.45M
            type_name = PyType_GetFullyQualifiedName(type);
2992
1.45M
        }
2993
1.45M
        Py_DECREF(type);
2994
1.45M
        if (!type_name) {
2995
0
            return NULL;
2996
0
        }
2997
2998
1.45M
        if (unicode_fromformat_write_str(writer, type_name,
2999
1.45M
                                         width, precision, flags) == -1) {
3000
0
            Py_DECREF(type_name);
3001
0
            return NULL;
3002
0
        }
3003
1.45M
        Py_DECREF(type_name);
3004
1.45M
        break;
3005
1.45M
    }
3006
3007
0
    case 'N':
3008
0
    {
3009
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3010
0
        assert(type_raw != NULL);
3011
3012
0
        if (!PyType_Check(type_raw)) {
3013
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3014
0
            return NULL;
3015
0
        }
3016
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3017
3018
0
        PyObject *type_name;
3019
0
        if (flags & F_ALT) {
3020
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3021
0
        }
3022
0
        else {
3023
0
            type_name = PyType_GetFullyQualifiedName(type);
3024
0
        }
3025
0
        if (!type_name) {
3026
0
            return NULL;
3027
0
        }
3028
0
        if (unicode_fromformat_write_str(writer, type_name,
3029
0
                                         width, precision, flags) == -1) {
3030
0
            Py_DECREF(type_name);
3031
0
            return NULL;
3032
0
        }
3033
0
        Py_DECREF(type_name);
3034
0
        break;
3035
0
    }
3036
3037
0
    default:
3038
0
    invalid_format:
3039
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3040
0
        return NULL;
3041
23.9M
    }
3042
3043
23.9M
    f++;
3044
23.9M
    return f;
3045
23.9M
}
3046
3047
static int
3048
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3049
11.7M
{
3050
11.7M
    Py_ssize_t len = strlen(format);
3051
11.7M
    writer->min_length += len + 100;
3052
11.7M
    writer->overallocate = 1;
3053
3054
    // Copy varags to be able to pass a reference to a subfunction.
3055
11.7M
    va_list vargs2;
3056
11.7M
    va_copy(vargs2, vargs);
3057
3058
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3059
    // to be encoded to ASCII.
3060
11.7M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3061
11.7M
    if (!is_ascii) {
3062
0
        Py_ssize_t i;
3063
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3064
0
        PyErr_Format(PyExc_ValueError,
3065
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3066
0
            "string, got a non-ASCII byte: 0x%02x",
3067
0
            (unsigned char)format[i]);
3068
0
        goto fail;
3069
0
    }
3070
3071
65.1M
    for (const char *f = format; *f; ) {
3072
53.4M
        if (*f == '%') {
3073
24.6M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3074
24.6M
            if (f == NULL)
3075
0
                goto fail;
3076
24.6M
        }
3077
28.7M
        else {
3078
28.7M
            const char *p = strchr(f, '%');
3079
28.7M
            if (p != NULL) {
3080
21.7M
                len = p - f;
3081
21.7M
            }
3082
7.07M
            else {
3083
7.07M
                len = strlen(f);
3084
7.07M
                writer->overallocate = 0;
3085
7.07M
            }
3086
3087
28.7M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3088
0
                goto fail;
3089
0
            }
3090
28.7M
            f += len;
3091
28.7M
        }
3092
53.4M
    }
3093
11.7M
    va_end(vargs2);
3094
11.7M
    return 0;
3095
3096
0
  fail:
3097
0
    va_end(vargs2);
3098
0
    return -1;
3099
11.7M
}
3100
3101
PyObject *
3102
PyUnicode_FromFormatV(const char *format, va_list vargs)
3103
11.7M
{
3104
11.7M
    _PyUnicodeWriter writer;
3105
11.7M
    _PyUnicodeWriter_Init(&writer);
3106
3107
11.7M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3108
0
        _PyUnicodeWriter_Dealloc(&writer);
3109
0
        return NULL;
3110
0
    }
3111
11.7M
    return _PyUnicodeWriter_Finish(&writer);
3112
11.7M
}
3113
3114
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments
   following *format* and forward them. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3125
3126
int
3127
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3128
0
{
3129
0
    va_list vargs;
3130
0
    va_start(vargs, format);
3131
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3132
0
    va_end(vargs);
3133
0
    return res;
3134
0
}
3135
3136
int
3137
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3138
                         va_list vargs)
3139
0
{
3140
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3141
0
    Py_ssize_t old_pos = _writer->pos;
3142
3143
0
    int res = unicode_from_format(_writer, format, vargs);
3144
3145
0
    if (res < 0) {
3146
0
        _writer->pos = old_pos;
3147
0
    }
3148
0
    return res;
3149
0
}
3150
3151
static Py_ssize_t
3152
unicode_get_widechar_size(PyObject *unicode)
3153
207k
{
3154
207k
    Py_ssize_t res;
3155
3156
207k
    assert(unicode != NULL);
3157
207k
    assert(_PyUnicode_CHECK(unicode));
3158
3159
207k
    res = _PyUnicode_LENGTH(unicode);
3160
#if SIZEOF_WCHAR_T == 2
3161
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3162
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3163
        const Py_UCS4 *end = s + res;
3164
        for (; s < end; ++s) {
3165
            if (*s > 0xFFFF) {
3166
                ++res;
3167
            }
3168
        }
3169
    }
3170
#endif
3171
207k
    return res;
3172
207k
}
3173
3174
static void
3175
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3176
207k
{
3177
207k
    assert(unicode != NULL);
3178
207k
    assert(_PyUnicode_CHECK(unicode));
3179
3180
207k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3181
696
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3182
696
        return;
3183
696
    }
3184
3185
207k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3186
180k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3187
4.27M
        for (; size--; ++s, ++w) {
3188
4.09M
            *w = *s;
3189
4.09M
        }
3190
180k
    }
3191
26.6k
    else {
3192
26.6k
#if SIZEOF_WCHAR_T == 4
3193
26.6k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3194
26.6k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3195
1.04M
        for (; size--; ++s, ++w) {
3196
1.01M
            *w = *s;
3197
1.01M
        }
3198
#else
3199
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3200
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3201
        for (; size--; ++s, ++w) {
3202
            Py_UCS4 ch = *s;
3203
            if (ch > 0xFFFF) {
3204
                assert(ch <= MAX_UNICODE);
3205
                /* encode surrogate pair in this case */
3206
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3207
                if (!size--)
3208
                    break;
3209
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3210
            }
3211
            else {
3212
                *w = ch;
3213
            }
3214
        }
3215
#endif
3216
26.6k
    }
3217
207k
}
3218
3219
#ifdef HAVE_WCHAR_H
3220
3221
/* Convert a Unicode object to a wide character string.
3222
3223
   - If w is NULL: return the number of wide characters (including the null
3224
     character) required to convert the unicode object. Ignore size argument.
3225
3226
   - Otherwise: return the number of wide characters (excluding the null
3227
     character) written into w. Write at most size wide characters (including
3228
     the null character). */
3229
Py_ssize_t
3230
PyUnicode_AsWideChar(PyObject *unicode,
3231
                     wchar_t *w,
3232
                     Py_ssize_t size)
3233
5.59k
{
3234
5.59k
    Py_ssize_t res;
3235
3236
5.59k
    if (unicode == NULL) {
3237
0
        PyErr_BadInternalCall();
3238
0
        return -1;
3239
0
    }
3240
5.59k
    if (!PyUnicode_Check(unicode)) {
3241
0
        PyErr_BadArgument();
3242
0
        return -1;
3243
0
    }
3244
3245
5.59k
    res = unicode_get_widechar_size(unicode);
3246
5.59k
    if (w == NULL) {
3247
0
        return res + 1;
3248
0
    }
3249
3250
5.59k
    if (size > res) {
3251
5.59k
        size = res + 1;
3252
5.59k
    }
3253
0
    else {
3254
0
        res = size;
3255
0
    }
3256
5.59k
    unicode_copy_as_widechar(unicode, w, size);
3257
3258
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3259
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3260
       non-Unicode locales and hence needs conversion first. */
3261
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3262
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3263
            return -1;
3264
        }
3265
    }
3266
#endif
3267
3268
5.59k
    return res;
3269
5.59k
}
3270
3271
wchar_t*
3272
PyUnicode_AsWideCharString(PyObject *unicode,
3273
                           Py_ssize_t *size)
3274
202k
{
3275
202k
    wchar_t *buffer;
3276
202k
    Py_ssize_t buflen;
3277
3278
202k
    if (unicode == NULL) {
3279
0
        PyErr_BadInternalCall();
3280
0
        return NULL;
3281
0
    }
3282
202k
    if (!PyUnicode_Check(unicode)) {
3283
0
        PyErr_BadArgument();
3284
0
        return NULL;
3285
0
    }
3286
3287
202k
    buflen = unicode_get_widechar_size(unicode);
3288
202k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3289
202k
    if (buffer == NULL) {
3290
0
        PyErr_NoMemory();
3291
0
        return NULL;
3292
0
    }
3293
202k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3294
3295
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3296
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3297
       non-Unicode locales and hence needs conversion first. */
3298
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3299
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3300
            return NULL;
3301
        }
3302
    }
3303
#endif
3304
3305
202k
    if (size != NULL) {
3306
201k
        *size = buflen;
3307
201k
    }
3308
1.11k
    else if (wcslen(buffer) != (size_t)buflen) {
3309
0
        PyMem_Free(buffer);
3310
0
        PyErr_SetString(PyExc_ValueError,
3311
0
                        "embedded null character");
3312
0
        return NULL;
3313
0
    }
3314
202k
    return buffer;
3315
202k
}
3316
3317
#endif /* HAVE_WCHAR_H */
3318
3319
int
3320
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3321
0
{
3322
0
    wchar_t **p = (wchar_t **)ptr;
3323
0
    if (obj == NULL) {
3324
0
        PyMem_Free(*p);
3325
0
        *p = NULL;
3326
0
        return 1;
3327
0
    }
3328
0
    if (PyUnicode_Check(obj)) {
3329
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3330
0
        if (*p == NULL) {
3331
0
            return 0;
3332
0
        }
3333
0
        return Py_CLEANUP_SUPPORTED;
3334
0
    }
3335
0
    PyErr_Format(PyExc_TypeError,
3336
0
                 "argument must be str, not %.50s",
3337
0
                 Py_TYPE(obj)->tp_name);
3338
0
    return 0;
3339
0
}
3340
3341
int
3342
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3343
0
{
3344
0
    wchar_t **p = (wchar_t **)ptr;
3345
0
    if (obj == NULL) {
3346
0
        PyMem_Free(*p);
3347
0
        *p = NULL;
3348
0
        return 1;
3349
0
    }
3350
0
    if (obj == Py_None) {
3351
0
        *p = NULL;
3352
0
        return 1;
3353
0
    }
3354
0
    if (PyUnicode_Check(obj)) {
3355
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3356
0
        if (*p == NULL) {
3357
0
            return 0;
3358
0
        }
3359
0
        return Py_CLEANUP_SUPPORTED;
3360
0
    }
3361
0
    PyErr_Format(PyExc_TypeError,
3362
0
                 "argument must be str or None, not %.50s",
3363
0
                 Py_TYPE(obj)->tp_name);
3364
0
    return 0;
3365
0
}
3366
3367
PyObject *
3368
PyUnicode_FromOrdinal(int ordinal)
3369
3.70M
{
3370
3.70M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3371
31
        PyErr_SetString(PyExc_ValueError,
3372
31
                        "chr() arg not in range(0x110000)");
3373
31
        return NULL;
3374
31
    }
3375
3376
3.70M
    return unicode_char((Py_UCS4)ordinal);
3377
3.70M
}
3378
3379
/* Return *obj* as an exact str object.

   - exact str: a new reference to the same object.
   - instance of a str subtype: the result of _PyUnicode_Copy(), i.e. a
     true Unicode object with the same data.
   - anything else: TypeError is raised and NULL returned. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    return _PyUnicode_Copy(obj);
}
3397
3398
/* Decode the bytes-like object *obj* to a str using *encoding* and
   *errors*.

   bytes objects are decoded directly; any other object must export a
   buffer (requested with PyBUF_SIMPLE).  str objects are rejected with
   TypeError.  Empty input yields the empty string, but *encoding* and
   *errors* are still validated with unicode_check_encoding_errors().

   Return a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        Py_ssize_t nbytes = PyBytes_GET_SIZE(obj);
        if (nbytes != 0) {
            return PyUnicode_Decode(PyBytes_AS_STRING(obj), nbytes,
                                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len != 0) {
        PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                             encoding, errors);
        PyBuffer_Release(&view);
        return decoded;
    }

    /* Empty buffer: release it before validating the arguments. */
    PyBuffer_Release(&view);
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    _Py_RETURN_UNICODE_EMPTY();
}
3450
3451
/* Normalize an encoding name like encodings.normalize_encoding()
   but allow to convert to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower* (lowercased when
   *to_lower* is true).  Every run of other characters is collapsed into a
   single '_', except at the very start, where it is dropped; a trailing
   run is dropped as well.

   encoding   null-terminated input name (must not be NULL)
   lower      output buffer
   lower_len  size of the output buffer in bytes, null terminator included

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   which includes the case lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    /* Without this, lower_len - 1 would wrap around and the bound check
       below would never trigger. */
    if (lower_len == 0) {
        return 0;
    }

    char *l = lower;
    /* Last slot, reserved for the null terminator. */
    char *l_end = &lower[lower_len - 1];
    /* Set while skipping a run of separator characters. */
    int punct = 0;

    for (const char *e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            punct = 1;
            continue;
        }

        /* Emit one '_' for the pending separator run, unless nothing
           has been written yet. */
        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = to_lower ? Py_TOLOWER(c) : c;
    }
    *l = '\0';
    return 1;
}
3500
3501
PyObject *
3502
PyUnicode_Decode(const char *s,
3503
                 Py_ssize_t size,
3504
                 const char *encoding,
3505
                 const char *errors)
3506
18.1M
{
3507
18.1M
    PyObject *buffer = NULL, *unicode;
3508
18.1M
    Py_buffer info;
3509
18.1M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3510
3511
18.1M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3512
0
        return NULL;
3513
0
    }
3514
3515
18.1M
    if (size == 0) {
3516
0
        _Py_RETURN_UNICODE_EMPTY();
3517
0
    }
3518
3519
18.1M
    if (encoding == NULL) {
3520
40.0k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3521
40.0k
    }
3522
3523
    /* Shortcuts for common default encodings */
3524
18.1M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3525
18.1M
        char *lower = buflower;
3526
3527
        /* Fast paths */
3528
18.1M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3529
3.80M
            lower += 3;
3530
3.80M
            if (*lower == '_') {
3531
                /* Match "utf8" and "utf_8" */
3532
3.80M
                lower++;
3533
3.80M
            }
3534
3535
3.80M
            if (lower[0] == '8' && lower[1] == 0) {
3536
3.80M
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3537
3.80M
            }
3538
484
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3539
89
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3540
89
            }
3541
395
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3542
89
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3543
89
            }
3544
3.80M
        }
3545
14.3M
        else {
3546
14.3M
            if (strcmp(lower, "ascii") == 0
3547
10.8M
                || strcmp(lower, "us_ascii") == 0) {
3548
10.8M
                return PyUnicode_DecodeASCII(s, size, errors);
3549
10.8M
            }
3550
    #ifdef MS_WINDOWS
3551
            else if (strcmp(lower, "mbcs") == 0) {
3552
                return PyUnicode_DecodeMBCS(s, size, errors);
3553
            }
3554
    #endif
3555
3.45M
            else if (strcmp(lower, "latin1") == 0
3556
3.45M
                     || strcmp(lower, "latin_1") == 0
3557
845k
                     || strcmp(lower, "iso_8859_1") == 0
3558
2.63M
                     || strcmp(lower, "iso8859_1") == 0) {
3559
2.63M
                return PyUnicode_DecodeLatin1(s, size, errors);
3560
2.63M
            }
3561
14.3M
        }
3562
18.1M
    }
3563
3564
    /* Decode via the codec registry */
3565
824k
    buffer = NULL;
3566
824k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3567
0
        goto onError;
3568
824k
    buffer = PyMemoryView_FromBuffer(&info);
3569
824k
    if (buffer == NULL)
3570
0
        goto onError;
3571
824k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3572
824k
    if (unicode == NULL)
3573
77.0k
        goto onError;
3574
747k
    if (!PyUnicode_Check(unicode)) {
3575
0
        PyErr_Format(PyExc_TypeError,
3576
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3577
0
                     "use codecs.decode() to decode to arbitrary types",
3578
0
                     encoding,
3579
0
                     Py_TYPE(unicode)->tp_name);
3580
0
        Py_DECREF(unicode);
3581
0
        goto onError;
3582
0
    }
3583
747k
    Py_DECREF(buffer);
3584
747k
    return unicode_result(unicode);
3585
3586
77.0k
  onError:
3587
77.0k
    Py_XDECREF(buffer);
3588
77.0k
    return NULL;
3589
747k
}
3590
3591
/* Run the str object `unicode` through the decoder of `encoding` in the
   codec registry and return whatever object it produces, or NULL with an
   exception set.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3607
3608
/* Run the str object `unicode` through the decoder of `encoding` in the
   codec registry and require the result to be a str.

   Returns the decoded str, or NULL with an exception set: a bad-argument
   error if `unicode` is not a str, whatever the codec raised, or TypeError
   if the decoder returned a non-str object.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* Name the type the decoder actually returned (v).  Formatting the
           type of the input instead would always print 'str' and produce
           the nonsensical "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3641
3642
/* Run the str object `unicode` through the encoder of `encoding` in the
   codec registry and return whatever object it produces, or NULL with an
   exception set.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result already carries the
       exception, so it is passed straight through. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3666
3667
3668
static PyObject *
3669
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3670
                      int current_locale)
3671
864
{
3672
864
    Py_ssize_t wlen;
3673
864
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3674
864
    if (wstr == NULL) {
3675
0
        return NULL;
3676
0
    }
3677
3678
864
    if ((size_t)wlen != wcslen(wstr)) {
3679
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3680
0
        PyMem_Free(wstr);
3681
0
        return NULL;
3682
0
    }
3683
3684
864
    char *str;
3685
864
    size_t error_pos;
3686
864
    const char *reason;
3687
864
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3688
864
                                 current_locale, error_handler);
3689
864
    PyMem_Free(wstr);
3690
3691
864
    if (res != 0) {
3692
0
        if (res == -2) {
3693
0
            PyObject *exc;
3694
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3695
0
                    "locale", unicode,
3696
0
                    (Py_ssize_t)error_pos,
3697
0
                    (Py_ssize_t)(error_pos+1),
3698
0
                    reason);
3699
0
            if (exc != NULL) {
3700
0
                PyCodec_StrictErrors(exc);
3701
0
                Py_DECREF(exc);
3702
0
            }
3703
0
        }
3704
0
        else if (res == -3) {
3705
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3706
0
        }
3707
0
        else {
3708
0
            PyErr_NoMemory();
3709
0
        }
3710
0
        return NULL;
3711
0
    }
3712
3713
864
    PyObject *bytes = PyBytes_FromString(str);
3714
864
    PyMem_RawFree(str);
3715
864
    return bytes;
3716
864
}
3717
3718
/* Encode `unicode` to bytes via unicode_encode_locale() with the
   current_locale flag set, using the error handler named by `errors`. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3724
3725
/* Encode `unicode` to bytes with the interpreter's filesystem codec.

   Uses the UTF-8 encoder when the filesystem codec is flagged as UTF-8,
   otherwise the configured codec by name (unless UTF-8 is forced at build
   time), and otherwise a fallback that does not involve the codec
   registry. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;

    if (fs_codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   fs_codec->error_handler,
                                   fs_codec->errors);
    }

#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (fs_codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         fs_codec->encoding,
                                         fs_codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *filesystem_errors = config->filesystem_errors;
    assert(filesystem_errors != NULL);
    _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
    assert(errors != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, errors, NULL);
#else
    return unicode_encode_locale(unicode, errors, 0);
#endif
}
3758
3759
/* Encode the str object `unicode` with the codec named `encoding`, using
   the error handler named `errors`, and return a bytes object.

   A NULL encoding means UTF-8.  UTF-8/16/32, ASCII, Latin-1 (and "mbcs" on
   Windows) are dispatched directly to the built-in encoders; any other
   encoding goes through the codec registry.  A registry encoder that
   returns a bytearray triggers a RuntimeWarning and the result is copied
   into bytes; any other non-bytes result raises TypeError.  Returns NULL
   with an exception set on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for "iso_8859_1" (10 characters) plus the terminating NUL:
       the longest name recognised by the shortcuts below. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        const char *name = normalized;

        if (strncmp(name, "utf", 3) == 0) {
            name += 3;
            if (*name == '_') {
                /* Accept both "utf8" and "utf_8" spellings */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else if (strcmp(name, "ascii") == 0
                 || strcmp(name, "us_ascii") == 0) {
            return _PyUnicode_AsASCIIString(unicode, errors);
        }
#ifdef MS_WINDOWS
        else if (strcmp(name, "mbcs") == 0) {
            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
        }
#endif
        else if (strcmp(name, "latin1") == 0
                 || strcmp(name, "latin_1") == 0
                 || strcmp(name, "iso_8859_1") == 0
                 || strcmp(name, "iso8859_1") == 0) {
            return _PyUnicode_AsLatin1String(unicode, errors);
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert it to bytes.
       The warning may itself be turned into an exception. */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3858
3859
/* Run the str object `unicode` through the encoder of `encoding` in the
   codec registry and require the result to be a str.

   Returns the encoder's result, or NULL with an exception set (TypeError
   if the encoder returned a non-str object).  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding(). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, codec_name, errors);
    if (result == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec_name,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3892
3893
static PyObject*
3894
unicode_decode_locale(const char *str, Py_ssize_t len,
3895
                      _Py_error_handler errors, int current_locale)
3896
311k
{
3897
311k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3898
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3899
0
        return NULL;
3900
0
    }
3901
3902
311k
    wchar_t *wstr;
3903
311k
    size_t wlen;
3904
311k
    const char *reason;
3905
311k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3906
311k
                                 current_locale, errors);
3907
311k
    if (res != 0) {
3908
0
        if (res == -2) {
3909
0
            PyObject *exc;
3910
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3911
0
                                        "locale", str, len,
3912
0
                                        (Py_ssize_t)wlen,
3913
0
                                        (Py_ssize_t)(wlen + 1),
3914
0
                                        reason);
3915
0
            if (exc != NULL) {
3916
0
                PyCodec_StrictErrors(exc);
3917
0
                Py_DECREF(exc);
3918
0
            }
3919
0
        }
3920
0
        else if (res == -3) {
3921
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3922
0
        }
3923
0
        else {
3924
0
            PyErr_NoMemory();
3925
0
        }
3926
0
        return NULL;
3927
0
    }
3928
3929
311k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3930
311k
    PyMem_RawFree(wstr);
3931
311k
    return unicode;
3932
311k
}
3933
3934
PyObject*
3935
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3936
                              const char *errors)
3937
0
{
3938
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3939
0
    return unicode_decode_locale(str, len, error_handler, 1);
3940
0
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocale(const char *str, const char *errors)
3944
299k
{
3945
299k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3946
299k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
299k
    return unicode_decode_locale(str, size, error_handler, 1);
3948
299k
}
3949
3950
3951
PyObject*
3952
200
PyUnicode_DecodeFSDefault(const char *s) {
3953
200
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3954
200
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3955
200
}
3956
3957
PyObject*
3958
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3959
181k
{
3960
181k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3961
181k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3962
181k
    if (fs_codec->utf8) {
3963
169k
        return unicode_decode_utf8(s, size,
3964
169k
                                   fs_codec->error_handler,
3965
169k
                                   fs_codec->errors,
3966
169k
                                   NULL);
3967
169k
    }
3968
11.5k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3969
11.5k
    else if (fs_codec->encoding) {
3970
0
        return PyUnicode_Decode(s, size,
3971
0
                                fs_codec->encoding,
3972
0
                                fs_codec->errors);
3973
0
    }
3974
11.5k
#endif
3975
11.5k
    else {
3976
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3977
           machinery is not ready and so cannot be used:
3978
           use mbstowcs() in this case. */
3979
11.5k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3980
11.5k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3981
11.5k
        assert(filesystem_errors != NULL);
3982
11.5k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3983
11.5k
        assert(errors != _Py_ERROR_UNKNOWN);
3984
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3985
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3986
#else
3987
11.5k
        return unicode_decode_locale(s, size, errors, 0);
3988
11.5k
#endif
3989
11.5k
    }
3990
181k
}
3991
3992
3993
int
3994
PyUnicode_FSConverter(PyObject* arg, void* addr)
3995
252k
{
3996
252k
    PyObject *path = NULL;
3997
252k
    PyObject *output = NULL;
3998
252k
    Py_ssize_t size;
3999
252k
    const char *data;
4000
252k
    if (arg == NULL) {
4001
0
        Py_DECREF(*(PyObject**)addr);
4002
0
        *(PyObject**)addr = NULL;
4003
0
        return 1;
4004
0
    }
4005
252k
    path = PyOS_FSPath(arg);
4006
252k
    if (path == NULL) {
4007
0
        return 0;
4008
0
    }
4009
252k
    if (PyBytes_Check(path)) {
4010
0
        output = path;
4011
0
    }
4012
252k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4013
252k
        output = PyUnicode_EncodeFSDefault(path);
4014
252k
        Py_DECREF(path);
4015
252k
        if (!output) {
4016
0
            return 0;
4017
0
        }
4018
252k
        assert(PyBytes_Check(output));
4019
252k
    }
4020
4021
252k
    size = PyBytes_GET_SIZE(output);
4022
252k
    data = PyBytes_AS_STRING(output);
4023
252k
    if ((size_t)size != strlen(data)) {
4024
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4025
0
        Py_DECREF(output);
4026
0
        return 0;
4027
0
    }
4028
252k
    *(PyObject**)addr = output;
4029
252k
    return Py_CLEANUP_SUPPORTED;
4030
252k
}
4031
4032
4033
int
4034
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4035
92.1k
{
4036
92.1k
    if (arg == NULL) {
4037
0
        Py_DECREF(*(PyObject**)addr);
4038
0
        *(PyObject**)addr = NULL;
4039
0
        return 1;
4040
0
    }
4041
4042
92.1k
    PyObject *path = PyOS_FSPath(arg);
4043
92.1k
    if (path == NULL) {
4044
0
        return 0;
4045
0
    }
4046
4047
92.1k
    PyObject *output = NULL;
4048
92.1k
    if (PyUnicode_Check(path)) {
4049
92.1k
        output = path;
4050
92.1k
    }
4051
0
    else if (PyBytes_Check(path)) {
4052
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4053
0
                                                  PyBytes_GET_SIZE(path));
4054
0
        Py_DECREF(path);
4055
0
        if (!output) {
4056
0
            return 0;
4057
0
        }
4058
0
    }
4059
0
    else {
4060
0
        PyErr_Format(PyExc_TypeError,
4061
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4062
0
                     Py_TYPE(arg)->tp_name);
4063
0
        Py_DECREF(path);
4064
0
        return 0;
4065
0
    }
4066
4067
92.1k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4068
92.1k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4069
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4070
0
        Py_DECREF(output);
4071
0
        return 0;
4072
0
    }
4073
92.1k
    *(PyObject**)addr = output;
4074
92.1k
    return Py_CLEANUP_SUPPORTED;
4075
92.1k
}
4076
4077
4078
static int unicode_fill_utf8(PyObject *unicode);
4079
4080
4081
static int
4082
unicode_ensure_utf8(PyObject *unicode)
4083
62.7M
{
4084
62.7M
    int err = 0;
4085
62.7M
    if (PyUnicode_UTF8(unicode) == NULL) {
4086
162k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4087
162k
        if (PyUnicode_UTF8(unicode) == NULL) {
4088
162k
            err = unicode_fill_utf8(unicode);
4089
162k
        }
4090
162k
        Py_END_CRITICAL_SECTION();
4091
162k
    }
4092
62.7M
    return err;
4093
62.7M
}
4094
4095
const char *
4096
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4097
62.7M
{
4098
62.7M
    if (!PyUnicode_Check(unicode)) {
4099
0
        PyErr_BadArgument();
4100
0
        if (psize) {
4101
0
            *psize = -1;
4102
0
        }
4103
0
        return NULL;
4104
0
    }
4105
4106
62.7M
    if (unicode_ensure_utf8(unicode) == -1) {
4107
207
        if (psize) {
4108
207
            *psize = -1;
4109
207
        }
4110
207
        return NULL;
4111
207
    }
4112
4113
62.7M
    if (psize) {
4114
62.5M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4115
62.5M
    }
4116
62.7M
    return PyUnicode_UTF8(unicode);
4117
62.7M
}
4118
4119
const char *
4120
PyUnicode_AsUTF8(PyObject *unicode)
4121
235k
{
4122
235k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4123
235k
}
4124
4125
const char *
4126
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4127
648k
{
4128
648k
    Py_ssize_t size;
4129
648k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4130
648k
    if (s && strlen(s) != (size_t)size) {
4131
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4132
0
        return NULL;
4133
0
    }
4134
648k
    return s;
4135
648k
}
4136
4137
/*
4138
PyUnicode_GetSize() has been deprecated since Python 3.3
4139
because it returned length of Py_UNICODE.
4140
4141
But this function is part of stable abi, because it doesn't
4142
include Py_UNICODE in signature and it was not excluded from
4143
stable ABI in PEP 384.
4144
*/
4145
PyAPI_FUNC(Py_ssize_t)
4146
PyUnicode_GetSize(PyObject *unicode)
4147
0
{
4148
0
    PyErr_SetString(PyExc_RuntimeError,
4149
0
                    "PyUnicode_GetSize has been removed.");
4150
0
    return -1;
4151
0
}
4152
4153
Py_ssize_t
4154
PyUnicode_GetLength(PyObject *unicode)
4155
25.7k
{
4156
25.7k
    if (!PyUnicode_Check(unicode)) {
4157
0
        PyErr_BadArgument();
4158
0
        return -1;
4159
0
    }
4160
25.7k
    return PyUnicode_GET_LENGTH(unicode);
4161
25.7k
}
4162
4163
Py_UCS4
4164
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4165
20
{
4166
20
    const void *data;
4167
20
    int kind;
4168
4169
20
    if (!PyUnicode_Check(unicode)) {
4170
0
        PyErr_BadArgument();
4171
0
        return (Py_UCS4)-1;
4172
0
    }
4173
20
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4174
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4175
0
        return (Py_UCS4)-1;
4176
0
    }
4177
20
    data = PyUnicode_DATA(unicode);
4178
20
    kind = PyUnicode_KIND(unicode);
4179
20
    return PyUnicode_READ(kind, data, index);
4180
20
}
4181
4182
int
4183
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4184
0
{
4185
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4186
0
        PyErr_BadArgument();
4187
0
        return -1;
4188
0
    }
4189
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4190
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4191
0
        return -1;
4192
0
    }
4193
0
    if (unicode_check_modifiable(unicode))
4194
0
        return -1;
4195
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4196
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4197
0
        return -1;
4198
0
    }
4199
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4200
0
                    index, ch);
4201
0
    return 0;
4202
0
}
4203
4204
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4209
4210
/* create or adjust a UnicodeDecodeError */
4211
static void
4212
make_decode_exception(PyObject **exceptionObject,
4213
                      const char *encoding,
4214
                      const char *input, Py_ssize_t length,
4215
                      Py_ssize_t startpos, Py_ssize_t endpos,
4216
                      const char *reason)
4217
2.35M
{
4218
2.35M
    if (*exceptionObject == NULL) {
4219
2.12M
        *exceptionObject = PyUnicodeDecodeError_Create(
4220
2.12M
            encoding, input, length, startpos, endpos, reason);
4221
2.12M
    }
4222
223k
    else {
4223
223k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4224
0
            goto onError;
4225
223k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4226
0
            goto onError;
4227
223k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4228
0
            goto onError;
4229
223k
    }
4230
2.35M
    return;
4231
4232
2.35M
onError:
4233
0
    Py_CLEAR(*exceptionObject);
4234
0
}
4235
4236
#ifdef MS_WINDOWS
4237
static int
4238
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4239
{
4240
    if (newsize > *size) {
4241
        wchar_t *newbuf = *buf;
4242
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4243
            PyErr_NoMemory();
4244
            return -1;
4245
        }
4246
        *buf = newbuf;
4247
    }
4248
    *size = newsize;
4249
    return 0;
4250
}
4251
4252
/* error handling callback helper:
4253
   build arguments, call the callback and check the arguments,
4254
   if no exception occurred, copy the replacement to the output
4255
   and adjust various state variables.
4256
   return 0 on success, -1 on error
4257
*/
4258
4259
static int
4260
unicode_decode_call_errorhandler_wchar(
4261
    const char *errors, PyObject **errorHandler,
4262
    const char *encoding, const char *reason,
4263
    const char **input, const char **inend, Py_ssize_t *startinpos,
4264
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4265
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4266
{
4267
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4268
4269
    PyObject *restuple = NULL;
4270
    PyObject *repunicode = NULL;
4271
    Py_ssize_t outsize;
4272
    Py_ssize_t insize;
4273
    Py_ssize_t requiredsize;
4274
    Py_ssize_t newpos;
4275
    PyObject *inputobj = NULL;
4276
    Py_ssize_t repwlen;
4277
4278
    if (*errorHandler == NULL) {
4279
        *errorHandler = PyCodec_LookupError(errors);
4280
        if (*errorHandler == NULL)
4281
            goto onError;
4282
    }
4283
4284
    make_decode_exception(exceptionObject,
4285
        encoding,
4286
        *input, *inend - *input,
4287
        *startinpos, *endinpos,
4288
        reason);
4289
    if (*exceptionObject == NULL)
4290
        goto onError;
4291
4292
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4293
    if (restuple == NULL)
4294
        goto onError;
4295
    if (!PyTuple_Check(restuple)) {
4296
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4297
        goto onError;
4298
    }
4299
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4300
        goto onError;
4301
4302
    /* Copy back the bytes variables, which might have been modified by the
4303
       callback */
4304
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4305
    if (!inputobj)
4306
        goto onError;
4307
    *input = PyBytes_AS_STRING(inputobj);
4308
    insize = PyBytes_GET_SIZE(inputobj);
4309
    *inend = *input + insize;
4310
    /* we can DECREF safely, as the exception has another reference,
4311
       so the object won't go away. */
4312
    Py_DECREF(inputobj);
4313
4314
    if (newpos<0)
4315
        newpos = insize+newpos;
4316
    if (newpos<0 || newpos>insize) {
4317
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4318
        goto onError;
4319
    }
4320
4321
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4322
    if (repwlen < 0)
4323
        goto onError;
4324
    repwlen--;
4325
    /* need more space? (at least enough for what we
4326
       have+the replacement+the rest of the string (starting
4327
       at the new input position), so we won't have to check space
4328
       when there are no errors in the rest of the string) */
4329
    requiredsize = *outpos;
4330
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4331
        goto overflow;
4332
    requiredsize += repwlen;
4333
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4334
        goto overflow;
4335
    requiredsize += insize - newpos;
4336
    outsize = *bufsize;
4337
    if (requiredsize > outsize) {
4338
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4339
            requiredsize = 2*outsize;
4340
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4341
            goto onError;
4342
        }
4343
    }
4344
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4345
    *outpos += repwlen;
4346
    *endinpos = newpos;
4347
    *inptr = *input + newpos;
4348
4349
    /* we made it! */
4350
    Py_DECREF(restuple);
4351
    return 0;
4352
4353
  overflow:
4354
    PyErr_SetString(PyExc_OverflowError,
4355
                    "decoded result is too long for a Python string");
4356
4357
  onError:
4358
    Py_XDECREF(restuple);
4359
    return -1;
4360
}
4361
#endif   /* MS_WINDOWS */
4362
4363
static int
4364
unicode_decode_call_errorhandler_writer(
4365
    const char *errors, PyObject **errorHandler,
4366
    const char *encoding, const char *reason,
4367
    const char **input, const char **inend, Py_ssize_t *startinpos,
4368
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4369
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4370
2.35M
{
4371
2.35M
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4372
4373
2.35M
    PyObject *restuple = NULL;
4374
2.35M
    PyObject *repunicode = NULL;
4375
2.35M
    Py_ssize_t insize;
4376
2.35M
    Py_ssize_t newpos;
4377
2.35M
    Py_ssize_t replen;
4378
2.35M
    Py_ssize_t remain;
4379
2.35M
    PyObject *inputobj = NULL;
4380
2.35M
    int need_to_grow = 0;
4381
2.35M
    const char *new_inptr;
4382
4383
2.35M
    if (*errorHandler == NULL) {
4384
2.12M
        *errorHandler = PyCodec_LookupError(errors);
4385
2.12M
        if (*errorHandler == NULL)
4386
0
            goto onError;
4387
2.12M
    }
4388
4389
2.35M
    make_decode_exception(exceptionObject,
4390
2.35M
        encoding,
4391
2.35M
        *input, *inend - *input,
4392
2.35M
        *startinpos, *endinpos,
4393
2.35M
        reason);
4394
2.35M
    if (*exceptionObject == NULL)
4395
0
        goto onError;
4396
4397
2.35M
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4398
2.35M
    if (restuple == NULL)
4399
2.08M
        goto onError;
4400
262k
    if (!PyTuple_Check(restuple)) {
4401
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4402
0
        goto onError;
4403
0
    }
4404
262k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4405
0
        goto onError;
4406
4407
    /* Copy back the bytes variables, which might have been modified by the
4408
       callback */
4409
262k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4410
262k
    if (!inputobj)
4411
0
        goto onError;
4412
262k
    remain = *inend - *input - *endinpos;
4413
262k
    *input = PyBytes_AS_STRING(inputobj);
4414
262k
    insize = PyBytes_GET_SIZE(inputobj);
4415
262k
    *inend = *input + insize;
4416
    /* we can DECREF safely, as the exception has another reference,
4417
       so the object won't go away. */
4418
262k
    Py_DECREF(inputobj);
4419
4420
262k
    if (newpos<0)
4421
0
        newpos = insize+newpos;
4422
262k
    if (newpos<0 || newpos>insize) {
4423
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4424
0
        goto onError;
4425
0
    }
4426
4427
262k
    replen = PyUnicode_GET_LENGTH(repunicode);
4428
262k
    if (replen > 1) {
4429
28.5k
        writer->min_length += replen - 1;
4430
28.5k
        need_to_grow = 1;
4431
28.5k
    }
4432
262k
    new_inptr = *input + newpos;
4433
262k
    if (*inend - new_inptr > remain) {
4434
        /* We don't know the decoding algorithm here so we make the worst
4435
           assumption that one byte decodes to one unicode character.
4436
           If unfortunately one byte could decode to more unicode characters,
4437
           the decoder may write out-of-bound then.  Is it possible for the
4438
           algorithms using this function? */
4439
14.7k
        writer->min_length += *inend - new_inptr - remain;
4440
14.7k
        need_to_grow = 1;
4441
14.7k
    }
4442
262k
    if (need_to_grow) {
4443
28.7k
        writer->overallocate = 1;
4444
28.7k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4445
28.7k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4446
0
            goto onError;
4447
28.7k
    }
4448
262k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4449
0
        goto onError;
4450
4451
262k
    *endinpos = newpos;
4452
262k
    *inptr = new_inptr;
4453
4454
    /* we made it! */
4455
262k
    Py_DECREF(restuple);
4456
262k
    return 0;
4457
4458
2.08M
  onError:
4459
2.08M
    Py_XDECREF(restuple);
4460
2.08M
    return -1;
4461
262k
}
4462
4463
/* --- UTF-7 Codec -------------------------------------------------------- */
4464
4465
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4466
4467
/* Base-64 helpers for the UTF-7 codec.  All three are macros that may
 * evaluate their argument more than once, so only pass expressions
 * without side effects. */

/* Non-zero when c belongs to the base-64 alphabet:
 * 'A'-'Z', 'a'-'z', '0'-'9', '+' or '/'. */
#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))

/* The 6-bit value (0..63) of the base-64 character c.  The caller must
 * already know that IS_BASE64(c) holds: anything that is not a letter,
 * a digit or '+' falls through to the last arm and yields 63. */
#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* The base-64 character for the low 6 bits of n; higher bits of n are
 * masked off. */
#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4489
4490
/* DECODE_DIRECT: true when a byte found outside a shift sequence stands
 * for itself in the decoded text.  Decoding is permissive: every value
 * up to 127 qualifies except '+', which opens a base-64 section.
 * The argument is evaluated twice. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4497
4498
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is never written
 * to, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  That is
 * the case for every non-NUL ASCII character whose category is not 3,
 * i.e. Set D, Set O and whitespace are all written directly.  RFC 2152
 * leaves the treatment of Set O and whitespace to the application; this
 * macro takes no option for it and always encodes both as themselves.
 * The range checks come first so the table is never indexed with a
 * value of 128 or more.  The argument is evaluated several times. */

#define ENCODE_DIRECT(c) \
    ((c) < 128 && (c) > 0 && ((utf7_category[(c)] != 3)))
4540
4541
PyObject *
4542
PyUnicode_DecodeUTF7(const char *s,
4543
                     Py_ssize_t size,
4544
                     const char *errors)
4545
0
{
4546
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4547
0
}
4548
4549
/* The decoder.  The only state we preserve is our read position,
4550
 * i.e. how many characters we have consumed.  So if we end in the
4551
 * middle of a shift sequence we have to back off the read position
4552
 * and the output to the beginning of the sequence, otherwise we lose
4553
 * all the shift state (seen bits, number of bits seen, high
4554
 * surrogate). */
4555
4556
PyObject *
4557
PyUnicode_DecodeUTF7Stateful(const char *s,
4558
                             Py_ssize_t size,
4559
                             const char *errors,
4560
                             Py_ssize_t *consumed)
4561
20.4k
{
4562
20.4k
    const char *starts = s;
4563
20.4k
    Py_ssize_t startinpos;
4564
20.4k
    Py_ssize_t endinpos;
4565
20.4k
    const char *e;
4566
20.4k
    _PyUnicodeWriter writer;
4567
20.4k
    const char *errmsg = "";
4568
20.4k
    int inShift = 0;
4569
20.4k
    Py_ssize_t shiftOutStart;
4570
20.4k
    unsigned int base64bits = 0;
4571
20.4k
    unsigned long base64buffer = 0;
4572
20.4k
    Py_UCS4 surrogate = 0;
4573
20.4k
    PyObject *errorHandler = NULL;
4574
20.4k
    PyObject *exc = NULL;
4575
4576
20.4k
    if (size == 0) {
4577
0
        if (consumed)
4578
0
            *consumed = 0;
4579
0
        _Py_RETURN_UNICODE_EMPTY();
4580
0
    }
4581
4582
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4583
20.4k
    _PyUnicodeWriter_Init(&writer);
4584
20.4k
    writer.min_length = size;
4585
4586
20.4k
    shiftOutStart = 0;
4587
20.4k
    e = s + size;
4588
4589
4.41M
    while (s < e) {
4590
4.40M
        Py_UCS4 ch;
4591
4.40M
      restart:
4592
4.40M
        ch = (unsigned char) *s;
4593
4594
4.40M
        if (inShift) { /* in a base-64 section */
4595
241k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4596
227k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4597
227k
                base64bits += 6;
4598
227k
                s++;
4599
227k
                if (base64bits >= 16) {
4600
                    /* we have enough bits for a UTF-16 value */
4601
79.4k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4602
79.4k
                    base64bits -= 16;
4603
79.4k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4604
79.4k
                    assert(outCh <= 0xffff);
4605
79.4k
                    if (surrogate) {
4606
                        /* expecting a second surrogate */
4607
7.96k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4608
2.63k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4609
2.63k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4610
0
                                goto onError;
4611
2.63k
                            surrogate = 0;
4612
2.63k
                            continue;
4613
2.63k
                        }
4614
5.32k
                        else {
4615
5.32k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4616
0
                                goto onError;
4617
5.32k
                            surrogate = 0;
4618
5.32k
                        }
4619
7.96k
                    }
4620
76.8k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4621
                        /* first surrogate */
4622
11.6k
                        surrogate = outCh;
4623
11.6k
                    }
4624
65.1k
                    else {
4625
65.1k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4626
0
                            goto onError;
4627
65.1k
                    }
4628
76.8k
                }
4629
227k
            }
4630
14.0k
            else { /* now leaving a base-64 section */
4631
14.0k
                inShift = 0;
4632
14.0k
                if (base64bits > 0) { /* left-over bits */
4633
11.8k
                    if (base64bits >= 6) {
4634
                        /* We've seen at least one base-64 character */
4635
6.27k
                        s++;
4636
6.27k
                        errmsg = "partial character in shift sequence";
4637
6.27k
                        goto utf7Error;
4638
6.27k
                    }
4639
5.61k
                    else {
4640
                        /* Some bits remain; they should be zero */
4641
5.61k
                        if (base64buffer != 0) {
4642
1.53k
                            s++;
4643
1.53k
                            errmsg = "non-zero padding bits in shift sequence";
4644
1.53k
                            goto utf7Error;
4645
1.53k
                        }
4646
5.61k
                    }
4647
11.8k
                }
4648
6.26k
                if (surrogate && DECODE_DIRECT(ch)) {
4649
2.65k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4650
0
                        goto onError;
4651
2.65k
                }
4652
6.26k
                surrogate = 0;
4653
6.26k
                if (ch == '-') {
4654
                    /* '-' is absorbed; other terminating
4655
                       characters are preserved */
4656
1.72k
                    s++;
4657
1.72k
                }
4658
6.26k
            }
4659
241k
        }
4660
4.16M
        else if ( ch == '+' ) {
4661
22.3k
            startinpos = s-starts;
4662
22.3k
            s++; /* consume '+' */
4663
22.3k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4664
1.81k
                s++;
4665
1.81k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4666
0
                    goto onError;
4667
1.81k
            }
4668
20.5k
            else if (s < e && !IS_BASE64(*s)) {
4669
2.85k
                s++;
4670
2.85k
                errmsg = "ill-formed sequence";
4671
2.85k
                goto utf7Error;
4672
2.85k
            }
4673
17.7k
            else { /* begin base64-encoded section */
4674
17.7k
                inShift = 1;
4675
17.7k
                surrogate = 0;
4676
17.7k
                shiftOutStart = writer.pos;
4677
17.7k
                base64bits = 0;
4678
17.7k
                base64buffer = 0;
4679
17.7k
            }
4680
22.3k
        }
4681
4.14M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4682
4.03M
            s++;
4683
4.03M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4684
0
                goto onError;
4685
4.03M
        }
4686
105k
        else {
4687
105k
            startinpos = s-starts;
4688
105k
            s++;
4689
105k
            errmsg = "unexpected special character";
4690
105k
            goto utf7Error;
4691
105k
        }
4692
4.28M
        continue;
4693
4.28M
utf7Error:
4694
116k
        endinpos = s-starts;
4695
116k
        if (unicode_decode_call_errorhandler_writer(
4696
116k
                errors, &errorHandler,
4697
116k
                "utf7", errmsg,
4698
116k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4699
116k
                &writer))
4700
9.05k
            goto onError;
4701
116k
    }
4702
4703
    /* end of string */
4704
4705
11.4k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4706
        /* if we're in an inconsistent state, that's an error */
4707
3.63k
        inShift = 0;
4708
3.63k
        if (surrogate ||
4709
3.07k
                (base64bits >= 6) ||
4710
2.34k
                (base64bits > 0 && base64buffer != 0)) {
4711
2.34k
            endinpos = size;
4712
2.34k
            if (unicode_decode_call_errorhandler_writer(
4713
2.34k
                    errors, &errorHandler,
4714
2.34k
                    "utf7", "unterminated shift sequence",
4715
2.34k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4716
2.34k
                    &writer))
4717
1.94k
                goto onError;
4718
405
            if (s < e)
4719
0
                goto restart;
4720
405
        }
4721
3.63k
    }
4722
4723
    /* return state */
4724
9.46k
    if (consumed) {
4725
0
        if (inShift) {
4726
0
            *consumed = startinpos;
4727
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4728
0
                PyObject *result = PyUnicode_FromKindAndData(
4729
0
                        writer.kind, writer.data, shiftOutStart);
4730
0
                Py_XDECREF(errorHandler);
4731
0
                Py_XDECREF(exc);
4732
0
                _PyUnicodeWriter_Dealloc(&writer);
4733
0
                return result;
4734
0
            }
4735
0
            writer.pos = shiftOutStart; /* back off output */
4736
0
        }
4737
0
        else {
4738
0
            *consumed = s-starts;
4739
0
        }
4740
0
    }
4741
4742
9.46k
    Py_XDECREF(errorHandler);
4743
9.46k
    Py_XDECREF(exc);
4744
9.46k
    return _PyUnicodeWriter_Finish(&writer);
4745
4746
10.9k
  onError:
4747
10.9k
    Py_XDECREF(errorHandler);
4748
10.9k
    Py_XDECREF(exc);
4749
10.9k
    _PyUnicodeWriter_Dealloc(&writer);
4750
10.9k
    return NULL;
4751
9.46k
}
4752
4753
4754
/* Encode the string object `str` as UTF-7 and return the result of
   PyBytesWriter_FinishWithPointer(), or NULL if the writer cannot be
   created or the worst-case size would overflow.  `errors` is not used
   by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* Reserve 8 output bytes per input character.
       It might be possible to tighten this worst case */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    char *out = PyBytesWriter_GetData(writer);
    int in_shift = 0;            /* currently inside a "+...-" section */
    unsigned long bitbuf = 0;    /* bits waiting to be written as base-64 */
    unsigned int nbits = 0;      /* number of pending bits in bitbuf */

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* Outside a shift sequence a literal '+' is written as "+-". */
        if (!in_shift && ch == '+') {
            *out++ = '+';
            *out++ = '-';
            continue;
        }

        if (ENCODE_DIRECT(ch)) {
            if (in_shift) {
                /* shifting out: flush the remaining bits first */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                in_shift = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
            }
            *out++ = (char) ch;
            continue;
        }

        /* ch has to be base-64 encoded; open a shift sequence if needed. */
        if (!in_shift) {
            *out++ = '+';
            in_shift = 1;
        }

        /* Split ch into one or two UTF-16 code units. */
        Py_UCS4 units[2];
        int nunits = 0;
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);
            units[nunits++] = Py_UNICODE_HIGH_SURROGATE(ch);
            units[nunits++] = Py_UNICODE_LOW_SURROGATE(ch);
        }
        else {
            units[nunits++] = ch;
        }

        /* Append 16 bits per unit and emit every complete 6-bit group. */
        for (int k = 0; k < nunits; k++) {
            nbits += 16;
            bitbuf = (bitbuf << 16) | units[k];
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
        }
    }

    /* Flush a final partial group and close an open shift sequence. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (in_shift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4843
4844
#undef IS_BASE64
4845
#undef FROM_BASE64
4846
#undef TO_BASE64
4847
#undef DECODE_DIRECT
4848
#undef ENCODE_DIRECT
4849
4850
/* --- UTF-8 Codec -------------------------------------------------------- */
4851
4852
PyObject *
4853
PyUnicode_DecodeUTF8(const char *s,
4854
                     Py_ssize_t size,
4855
                     const char *errors)
4856
47.1M
{
4857
47.1M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4858
47.1M
}
4859
4860
#include "stringlib/asciilib.h"
4861
#include "stringlib/codecs.h"
4862
#include "stringlib/undef.h"
4863
4864
#include "stringlib/ucs1lib.h"
4865
#include "stringlib/codecs.h"
4866
#include "stringlib/undef.h"
4867
4868
#include "stringlib/ucs2lib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs4lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
/* Word-wide bit patterns, one variant per supported size_t width:
 *
 *   ASCII_CHAR_MASK  the top bit of every byte; AND-ing a size_t with it
 *                    is non-zero exactly when some byte is non-ASCII.
 *   VECTOR_0101      the low bit of every byte; used when counting code
 *                    points in a UTF-8 string.
 *   VECTOR_00FF      every other byte; used to fold the per-byte counts
 *                    into wider lanes.
 */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4890
4891
/* ctz(v): index of the lowest set bit of v ("count trailing zeros").
 * HAVE_CTZ is 1 when a compiler intrinsic provides it and 0 otherwise,
 * in which case ctz() is not defined at all.  The callers in this file
 * only pass non-zero values; the result for 0 is not meaningful. */
#if (defined(__GNUC__) || defined(__clang__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    /* Widen first so the same builtin serves 32- and 64-bit size_t. */
    unsigned long long wide = (unsigned long long)v;
    return (unsigned int)__builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#else
    _BitScanForward64(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#define HAVE_CTZ 0
#endif
4914
4915
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Build a size_t from the bytes p[0]..p[size-1] without an unaligned
 * word access and without reading past p[size-1].  Bytes beyond `size`
 * are zero.  At most SIZEOF_SIZE_T bytes are read, even if `size` is
 * larger.
 *
 * Only compiled for little-endian builds: byte i of the buffer is stored
 * into byte i of the word through a union, which is faster than building
 * the value with shifts and ORs; big-endian machines are rare and hard
 * to maintain. */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t value;
        unsigned char bytes[SIZEOF_SIZE_T];
    } word;
    word.value = 0;

    /* Enter at the case matching `size` and fall through, copying one
       byte per case down to byte 0. */
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    case 0:
        /* nothing to read */
        break;
    }
    return word.value;
}
#endif
4962
4963
/*
4964
 * Find the first non-ASCII character in a byte sequence.
4965
 *
4966
 * This function scans a range of bytes from `start` to `end` and returns the
4967
 * index of the first byte that is not an ASCII character (i.e., has the most
4968
 * significant bit set). If all characters in the range are ASCII, it returns
4969
 * `end - start`.
4970
 */
4971
static Py_ssize_t
4972
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4973
45.4M
{
4974
    // The search is done in `size_t` chunks.
4975
    // The start and end might not be aligned at `size_t` boundaries,
4976
    // so they're handled specially.
4977
4978
45.4M
    const unsigned char *p = start;
4979
4980
45.4M
    if (end - start >= SIZEOF_SIZE_T) {
4981
        // Avoid unaligned read.
4982
18.4M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4983
18.4M
        size_t u;
4984
18.4M
        memcpy(&u, p, sizeof(size_t));
4985
18.4M
        u &= ASCII_CHAR_MASK;
4986
18.4M
        if (u) {
4987
2.69M
            return (ctz(u) - 7) / 8;
4988
2.69M
        }
4989
15.7M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4990
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4991
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4992
        while (p < p2) {
4993
            if (*p & 0x80) {
4994
                return p - start;
4995
            }
4996
            p++;
4997
        }
4998
#endif
4999
5000
15.7M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5001
97.5M
        while (p <= e) {
5002
82.8M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5003
82.8M
            if (u) {
5004
1.04M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5005
1.04M
                return p - start + (ctz(u) - 7) / 8;
5006
#else
5007
                // big endian and minor compilers are difficult to test.
5008
                // fallback to per byte check.
5009
                break;
5010
#endif
5011
1.04M
            }
5012
81.8M
            p += SIZEOF_SIZE_T;
5013
81.8M
        }
5014
15.7M
    }
5015
41.7M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5016
45.4M
    assert((end - p) < SIZEOF_SIZE_T);
5017
    // we can not use *(const size_t*)p to avoid buffer overrun.
5018
41.7M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5019
41.7M
    if (u) {
5020
3.05M
        return p - start + (ctz(u) - 7) / 8;
5021
3.05M
    }
5022
38.6M
    return end - start;
5023
#else
5024
    while (p < end) {
5025
        if (*p & 0x80) {
5026
            break;
5027
        }
5028
        p++;
5029
    }
5030
    return p - start;
5031
#endif
5032
41.7M
}
5033
5034
/* Returns 1 if the byte in the low 8 bits of `ch` starts a UTF-8
   sequence and 0 if it is a continuation byte.  Only bits 6 and 7 are
   examined: 10xxxxxx is a continuation byte, everything else
   (0xxxxxxx or 11xxxxxx) is a first byte. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    return (ch & 0xC0u) != 0x80u;
}
5040
5041
static inline size_t
5042
vector_utf8_start_chars(size_t v)
5043
38.5M
{
5044
38.5M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5045
38.5M
}
5046
5047
5048
// Count the number of UTF-8 code points in a given byte sequence.
5049
static Py_ssize_t
5050
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5051
338k
{
5052
338k
    Py_ssize_t len = 0;
5053
5054
338k
    if (end - s >= SIZEOF_SIZE_T) {
5055
283k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5056
17.1k
            len += scalar_utf8_start_char(*s++);
5057
17.1k
        }
5058
5059
676k
        while (s + SIZEOF_SIZE_T <= end) {
5060
409k
            const unsigned char *e = end;
5061
409k
            if (e - s > SIZEOF_SIZE_T * 255) {
5062
144k
                e = s + SIZEOF_SIZE_T * 255;
5063
144k
            }
5064
409k
            Py_ssize_t vstart = 0;
5065
38.9M
            while (s + SIZEOF_SIZE_T <= e) {
5066
38.5M
                size_t v = *(size_t*)s;
5067
38.5M
                size_t vs = vector_utf8_start_chars(v);
5068
38.5M
                vstart += vs;
5069
38.5M
                s += SIZEOF_SIZE_T;
5070
38.5M
            }
5071
409k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5072
409k
            vstart += vstart >> 16;
5073
409k
#if SIZEOF_SIZE_T == 8
5074
409k
            vstart += vstart >> 32;
5075
409k
#endif
5076
409k
            len += vstart & 0x7ff;
5077
409k
        }
5078
266k
    }
5079
1.17M
    while (s < end) {
5080
839k
        len += scalar_utf8_start_char(*s++);
5081
839k
    }
5082
338k
    return len;
5083
338k
}
5084
5085
static Py_ssize_t
5086
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5087
14.0M
{
5088
14.0M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5089
14.0M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5090
13.9M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5091
10.9M
    {
5092
        /* Fast path, see in STRINGLIB(utf8_decode) for
5093
           an explanation. */
5094
10.9M
        const char *p = start;
5095
10.9M
        Py_UCS1 *q = dest;
5096
14.1M
        while (p + SIZEOF_SIZE_T <= end) {
5097
5.39M
            size_t value = *(const size_t *) p;
5098
5.39M
            if (value & ASCII_CHAR_MASK)
5099
2.18M
                break;
5100
3.20M
            *((size_t *)q) = value;
5101
3.20M
            p += SIZEOF_SIZE_T;
5102
3.20M
            q += SIZEOF_SIZE_T;
5103
3.20M
        }
5104
52.8M
        while (p < end) {
5105
44.1M
            if ((unsigned char)*p & 0x80)
5106
2.20M
                break;
5107
41.9M
            *q++ = *p++;
5108
41.9M
        }
5109
10.9M
        return p - start;
5110
10.9M
    }
5111
3.07M
#endif
5112
3.07M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5113
3.07M
                                         (const unsigned char*)end);
5114
3.07M
    memcpy(dest, start, pos);
5115
3.07M
    return pos;
5116
14.0M
}
5117
5118
static int
5119
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5120
                         const char *starts, const char *s, const char *end,
5121
                         _Py_error_handler error_handler,
5122
                         const char *errors,
5123
                         Py_ssize_t *consumed)
5124
6.79M
{
5125
6.79M
    Py_ssize_t startinpos, endinpos;
5126
6.79M
    const char *errmsg = "";
5127
6.79M
    PyObject *error_handler_obj = NULL;
5128
6.79M
    PyObject *exc = NULL;
5129
5130
279M
    while (s < end) {
5131
276M
        Py_UCS4 ch;
5132
276M
        int kind = writer->kind;
5133
5134
276M
        if (kind == PyUnicode_1BYTE_KIND) {
5135
7.06M
            if (PyUnicode_IS_ASCII(writer->buffer))
5136
6.45M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5137
614k
            else
5138
614k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5139
269M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5140
99.5M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5141
169M
        } else {
5142
169M
            assert(kind == PyUnicode_4BYTE_KIND);
5143
169M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5144
169M
        }
5145
5146
276M
        switch (ch) {
5147
4.01M
        case 0:
5148
4.01M
            if (s == end || consumed)
5149
3.98M
                goto End;
5150
25.6k
            errmsg = "unexpected end of data";
5151
25.6k
            startinpos = s - starts;
5152
25.6k
            endinpos = end - starts;
5153
25.6k
            break;
5154
199M
        case 1:
5155
199M
            errmsg = "invalid start byte";
5156
199M
            startinpos = s - starts;
5157
199M
            endinpos = startinpos + 1;
5158
199M
            break;
5159
64.6M
        case 2:
5160
64.6M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5161
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5162
0
            {
5163
                /* Truncated surrogate code in range D800-DFFF */
5164
0
                goto End;
5165
0
            }
5166
64.6M
            _Py_FALLTHROUGH;
5167
66.0M
        case 3:
5168
66.2M
        case 4:
5169
66.2M
            errmsg = "invalid continuation byte";
5170
66.2M
            startinpos = s - starts;
5171
66.2M
            endinpos = startinpos + ch - 1;
5172
66.2M
            break;
5173
6.47M
        default:
5174
            // ch doesn't fit into kind, so change the buffer kind to write
5175
            // the character
5176
6.47M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5177
0
                goto onError;
5178
6.47M
            continue;
5179
276M
        }
5180
5181
265M
        if (error_handler == _Py_ERROR_UNKNOWN)
5182
160k
            error_handler = _Py_GetErrorHandler(errors);
5183
5184
265M
        switch (error_handler) {
5185
0
        case _Py_ERROR_IGNORE:
5186
0
            s += (endinpos - startinpos);
5187
0
            break;
5188
5189
263M
        case _Py_ERROR_REPLACE:
5190
263M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5191
0
                goto onError;
5192
263M
            s += (endinpos - startinpos);
5193
263M
            break;
5194
5195
2.34M
        case _Py_ERROR_SURROGATEESCAPE:
5196
2.34M
        {
5197
2.34M
            Py_ssize_t i;
5198
5199
2.34M
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5200
0
                goto onError;
5201
4.68M
            for (i=startinpos; i<endinpos; i++) {
5202
2.34M
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5203
2.34M
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5204
2.34M
                                ch + 0xdc00);
5205
2.34M
                writer->pos++;
5206
2.34M
            }
5207
2.34M
            s += (endinpos - startinpos);
5208
2.34M
            break;
5209
2.34M
        }
5210
5211
1.50k
        default:
5212
1.50k
            if (unicode_decode_call_errorhandler_writer(
5213
1.50k
                    errors, &error_handler_obj,
5214
1.50k
                    "utf-8", errmsg,
5215
1.50k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5216
1.50k
                    writer)) {
5217
1.50k
                goto onError;
5218
1.50k
            }
5219
5220
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5221
0
                goto onError;
5222
0
            }
5223
265M
        }
5224
265M
    }
5225
5226
6.79M
End:
5227
6.79M
    if (consumed)
5228
663
        *consumed = s - starts;
5229
5230
6.79M
    Py_XDECREF(error_handler_obj);
5231
6.79M
    Py_XDECREF(exc);
5232
6.79M
    return 0;
5233
5234
1.50k
onError:
5235
1.50k
    Py_XDECREF(error_handler_obj);
5236
1.50k
    Py_XDECREF(exc);
5237
1.50k
    return -1;
5238
6.79M
}
5239
5240
5241
static PyObject *
5242
unicode_decode_utf8(const char *s, Py_ssize_t size,
5243
                    _Py_error_handler error_handler, const char *errors,
5244
                    Py_ssize_t *consumed)
5245
70.2M
{
5246
70.2M
    if (size == 0) {
5247
1.93M
        if (consumed) {
5248
0
            *consumed = 0;
5249
0
        }
5250
1.93M
        _Py_RETURN_UNICODE_EMPTY();
5251
1.93M
    }
5252
5253
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5254
68.3M
    if (size == 1 && (unsigned char)s[0] < 128) {
5255
25.8M
        if (consumed) {
5256
0
            *consumed = 1;
5257
0
        }
5258
25.8M
        return get_latin1_char((unsigned char)s[0]);
5259
25.8M
    }
5260
5261
    // I don't know this check is necessary or not. But there is a test
5262
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5263
42.4M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5264
0
        PyErr_NoMemory();
5265
0
        return NULL;
5266
0
    }
5267
5268
42.4M
    const char *starts = s;
5269
42.4M
    const char *end = s + size;
5270
5271
42.4M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5272
42.4M
    if (pos == size) {  // fast path: ASCII string.
5273
35.6M
        PyObject *u = PyUnicode_New(size, 127);
5274
35.6M
        if (u == NULL) {
5275
0
            return NULL;
5276
0
        }
5277
35.6M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5278
35.6M
        if (consumed) {
5279
102
            *consumed = size;
5280
102
        }
5281
35.6M
        return u;
5282
35.6M
    }
5283
5284
6.74M
    int maxchr = 127;
5285
6.74M
    Py_ssize_t maxsize = size;
5286
5287
6.74M
    unsigned char ch = (unsigned char)(s[pos]);
5288
    // error handler other than strict may remove/replace the invalid byte.
5289
    // consumed != NULL allows 1~3 bytes remainings.
5290
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5291
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5292
    // reallocation and copy.
5293
6.74M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5294
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5295
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5296
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5297
        // means that it is no longer necessary to allocate several times the required amount
5298
        // of memory.
5299
338k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5300
338k
        if (ch < 0xc4) { // latin1
5301
224k
            maxchr = 0xff;
5302
224k
        }
5303
113k
        else if (ch < 0xf0) { // ucs2
5304
101k
            maxchr = 0xffff;
5305
101k
        }
5306
12.3k
        else { // ucs4
5307
12.3k
            maxchr = 0x10ffff;
5308
12.3k
        }
5309
338k
    }
5310
6.74M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5311
6.74M
    if (!u) {
5312
0
        return NULL;
5313
0
    }
5314
5315
    // Use _PyUnicodeWriter after fast path is failed.
5316
6.74M
    _PyUnicodeWriter writer;
5317
6.74M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5318
6.74M
    if (maxchr <= 255) {
5319
6.63M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5320
6.63M
        s += pos;
5321
6.63M
        writer.pos = pos;
5322
6.63M
    }
5323
5324
6.74M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5325
6.74M
                                 error_handler, errors,
5326
6.74M
                                 consumed) < 0) {
5327
1.50k
        _PyUnicodeWriter_Dealloc(&writer);
5328
1.50k
        return NULL;
5329
1.50k
    }
5330
6.74M
    return _PyUnicodeWriter_Finish(&writer);
5331
6.74M
}
5332
5333
5334
// Used by PyUnicodeWriter_WriteUTF8() implementation
5335
int
5336
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5337
                            const char *s, Py_ssize_t size,
5338
                            _Py_error_handler error_handler, const char *errors,
5339
                            Py_ssize_t *consumed)
5340
3.15M
{
5341
3.15M
    if (size == 0) {
5342
8.55k
        if (consumed) {
5343
0
            *consumed = 0;
5344
0
        }
5345
8.55k
        return 0;
5346
8.55k
    }
5347
5348
    // fast path: try ASCII string.
5349
3.14M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5350
0
        return -1;
5351
0
    }
5352
5353
3.14M
    const char *starts = s;
5354
3.14M
    const char *end = s + size;
5355
3.14M
    Py_ssize_t decoded = 0;
5356
3.14M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5357
3.14M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5358
3.14M
        decoded = ascii_decode(s, end, dest);
5359
3.14M
        writer->pos += decoded;
5360
5361
3.14M
        if (decoded == size) {
5362
3.10M
            if (consumed) {
5363
1.11k
                *consumed = size;
5364
1.11k
            }
5365
3.10M
            return 0;
5366
3.10M
        }
5367
43.7k
        s += decoded;
5368
43.7k
    }
5369
5370
45.7k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5371
45.7k
                                    error_handler, errors, consumed);
5372
3.14M
}
5373
5374
5375
PyObject *
5376
PyUnicode_DecodeUTF8Stateful(const char *s,
5377
                             Py_ssize_t size,
5378
                             const char *errors,
5379
                             Py_ssize_t *consumed)
5380
70.0M
{
5381
70.0M
    return unicode_decode_utf8(s, size,
5382
70.0M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5383
70.0M
                               errors, consumed);
5384
70.0M
}
5385
5386
5387
/* UTF-8 decoder producing a wchar_t string.

   'errors' selects the error handler and must be _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS; for any other value
   the function returns -3 without decoding anything.

   On success, return 0, write a pointer to a newly allocated wide character
   string into *wstr (use PyMem_RawFree() to free the memory) and write the
   output length (in number of wchar_t units) into *wlen (if wlen is set).

   On memory allocation failure, return -1.

   On decoding error (only reachable when the surrogateescape handler is not
   selected), return -2. If wlen is non-NULL, write the start of the illegal
   byte sequence into *wlen. If reason is not NULL, write the decoding error
   message into *reason. */
int
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
                 const char **reason, _Py_error_handler errors)
{
    const char *orig_s = s;     /* start of input, used to report error offsets */
    const char *e;              /* one past the last input byte */
    wchar_t *unicode;           /* output buffer */
    Py_ssize_t outpos;          /* number of wchar_t units written so far */

    /* Translate the error handler enum into two flags. */
    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        /* unsupported error handler */
        return -3;
    }

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Reject sizes for which (size + 1) * sizeof(wchar_t) would overflow. */
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
        return -1;
    }

    /* One extra unit for the terminating L'\0'. */
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode) {
        return -1;
    }

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
        /* Bulk-decode as much as possible; the helper advances s and outpos.
           NOTE(review): the helper's return-value protocol is defined
           elsewhere; the handling below treats values <= 0xFF as status
           codes and larger values as a code point to be written here. */
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* With 4-byte wchar_t this branch is not expected to be taken. */
            Py_UNREACHABLE();
#else
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* write a surrogate pair */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* Status 0 with all input consumed: decoding is complete. */
            if (!ch && s == e) {
                break;
            }

            if (surrogateescape) {
                /* Escape the offending byte as 0xDC00 + byte and move on
                   by a single byte. */
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
            }
            else {
                /* Is it a valid three-byte code? */
                /* With surrogatepass, decode a well-formed three-byte
                   sequence by hand (lead 0xE0-0xEF followed by two
                   continuation bytes) instead of failing. */
                if (surrogatepass
                    && (e - s) >= 3
                    && (s[0] & 0xf0) == 0xe0
                    && (s[1] & 0xc0) == 0x80
                    && (s[2] & 0xc0) == 0x80)
                {
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
                    s += 3;
                    unicode[outpos++] = ch;
                }
                else {
                    /* Decoding error: release the buffer and report. */
                    PyMem_RawFree(unicode );
                    if (reason != NULL) {
                        /* Map the decoder status code to a message. */
                        switch (ch) {
                        case 0:
                            *reason = "unexpected end of data";
                            break;
                        case 1:
                            *reason = "invalid start byte";
                            break;
                        /* 2, 3, 4 */
                        default:
                            *reason = "invalid continuation byte";
                            break;
                        }
                    }
                    if (wlen != NULL) {
                        /* byte offset of the illegal sequence */
                        *wlen = s - orig_s;
                    }
                    return -2;
                }
            }
        }
    }
    unicode[outpos] = L'\0';
    if (wlen) {
        *wlen = outpos;
    }
    *wstr = unicode;
    return 0;
}
5506
5507
5508
wchar_t*
5509
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5510
                               size_t *wlen)
5511
0
{
5512
0
    wchar_t *wstr;
5513
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5514
0
                               &wstr, wlen,
5515
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5516
0
    if (res != 0) {
5517
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5518
0
        assert(res != -3);
5519
0
        if (wlen) {
5520
0
            *wlen = (size_t)res;
5521
0
        }
5522
0
        return NULL;
5523
0
    }
5524
0
    return wstr;
5525
0
}
5526
5527
5528
/* UTF-8 encoder for a NUL-terminated wchar_t string.

   'errors' must be _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or
   _Py_ERROR_SURROGATEPASS; for any other value the function returns -3.

   On success, return 0 and write the newly allocated, NUL-terminated
   character string into *str. The buffer comes from PyMem_RawMalloc() when
   raw_malloc is non-zero (free it with PyMem_RawFree()), otherwise from
   PyMem_Malloc() (free it with PyMem_Free()).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1. If the failure happens while
   shrinking the buffer, (size_t)-1 is also written into *error_pos (if
   error_pos is set). */
int
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
                 const char **reason, int raw_malloc, _Py_error_handler errors)
{
    /* worst case: every character needs four bytes */
    const Py_ssize_t max_char_size = 4;
    Py_ssize_t len = wcslen(text);

    assert(len >= 0);

    /* Translate the error handler enum into two flags. */
    int surrogateescape = 0;
    int surrogatepass = 0;
    switch (errors)
    {
    case _Py_ERROR_STRICT:
        break;
    case _Py_ERROR_SURROGATEESCAPE:
        surrogateescape = 1;
        break;
    case _Py_ERROR_SURROGATEPASS:
        surrogatepass = 1;
        break;
    default:
        /* unsupported error handler */
        return -3;
    }

    /* Reject lengths for which (len + 1) * max_char_size would overflow. */
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
        return -1;
    }
    char *bytes;
    if (raw_malloc) {
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
    }
    else {
        bytes = PyMem_Malloc((len + 1) * max_char_size);
    }
    if (bytes == NULL) {
        return -1;
    }

    char *p = bytes;    /* write cursor into the output buffer */
    Py_ssize_t i;
    for (i = 0; i < len; ) {
        Py_ssize_t ch_pos = i;  /* index reported on error */
        Py_UCS4 ch = text[i];
        i++;
#if Py_UNICODE_SIZE == 2
        /* Combine a high/low surrogate pair into a single code point. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
            && i < len
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
            i++;
        }
#endif

        if (ch < 0x80) {
            /* Encode ASCII */
            *p++ = (char) ch;

        }
        else if (ch < 0x0800) {
            /* Encode as a two-byte sequence (U+0080..U+07FF) */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
            /* surrogateescape error handler */
            /* Only U+DC80..U+DCFF can be turned back into a raw byte, and
               only when surrogateescape is selected; everything else is an
               encoding error. */
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
                if (error_pos != NULL) {
                    *error_pos = (size_t)ch_pos;
                }
                if (reason != NULL) {
                    *reason = "encoding error";
                }
                /* Release the buffer with the matching allocator. */
                if (raw_malloc) {
                    PyMem_RawFree(bytes);
                }
                else {
                    PyMem_Free(bytes);
                }
                return -2;
            }
            /* Emit the escaped byte (low 8 bits of the surrogate). */
            *p++ = (char)(ch & 0xff);
        }
        else if (ch < 0x10000) {
            /* Encode as a three-byte sequence; with surrogatepass this also
               covers surrogate code points. */
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {  /* ch >= 0x10000 */
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
    }
    *p++ = '\0';

    /* Shrink the worst-case buffer to the size actually used
       (terminating NUL included). */
    size_t final_size = (p - bytes);
    char *bytes2;
    if (raw_malloc) {
        bytes2 = PyMem_RawRealloc(bytes, final_size);
    }
    else {
        bytes2 = PyMem_Realloc(bytes, final_size);
    }
    if (bytes2 == NULL) {
        if (error_pos != NULL) {
            *error_pos = (size_t)-1;
        }
        /* The original buffer is still valid after a failed realloc. */
        if (raw_malloc) {
            PyMem_RawFree(bytes);
        }
        else {
            PyMem_Free(bytes);
        }
        return -1;
    }
    *str = bytes2;
    return 0;
}
5661
5662
5663
/* Primary internal function which creates utf8 encoded bytes objects.
5664
5665
   Allocation strategy:  if the string is short, convert into a stack buffer
5666
   and allocate exactly as much space needed at the end.  Else allocate the
5667
   maximum possible needed (4 result bytes per Unicode character), and return
5668
   the excess memory at the end.
5669
*/
5670
static PyObject *
5671
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5672
                    const char *errors)
5673
18.1M
{
5674
18.1M
    if (!PyUnicode_Check(unicode)) {
5675
0
        PyErr_BadArgument();
5676
0
        return NULL;
5677
0
    }
5678
5679
18.1M
    if (PyUnicode_UTF8(unicode))
5680
9.21M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5681
9.21M
                                         PyUnicode_UTF8_LENGTH(unicode));
5682
5683
8.91M
    int kind = PyUnicode_KIND(unicode);
5684
8.91M
    const void *data = PyUnicode_DATA(unicode);
5685
8.91M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5686
5687
8.91M
    PyBytesWriter *writer;
5688
8.91M
    char *end;
5689
5690
8.91M
    switch (kind) {
5691
0
    default:
5692
0
        Py_UNREACHABLE();
5693
5.96M
    case PyUnicode_1BYTE_KIND:
5694
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5695
5.96M
        assert(!PyUnicode_IS_ASCII(unicode));
5696
5.96M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5697
5.96M
                                      error_handler, errors, &end);
5698
5.96M
        break;
5699
1.76M
    case PyUnicode_2BYTE_KIND:
5700
1.76M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5701
1.76M
                                      error_handler, errors, &end);
5702
1.76M
        break;
5703
1.18M
    case PyUnicode_4BYTE_KIND:
5704
1.18M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5705
1.18M
                                      error_handler, errors, &end);
5706
1.18M
        break;
5707
8.91M
    }
5708
5709
8.91M
    if (writer == NULL) {
5710
154k
        PyBytesWriter_Discard(writer);
5711
154k
        return NULL;
5712
154k
    }
5713
8.76M
    return PyBytesWriter_FinishWithPointer(writer, end);
5714
8.91M
}
5715
5716
static int
5717
unicode_fill_utf8(PyObject *unicode)
5718
162k
{
5719
162k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5720
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5721
162k
    assert(!PyUnicode_IS_ASCII(unicode));
5722
5723
162k
    int kind = PyUnicode_KIND(unicode);
5724
162k
    const void *data = PyUnicode_DATA(unicode);
5725
162k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5726
5727
162k
    PyBytesWriter *writer;
5728
162k
    char *end;
5729
5730
162k
    switch (kind) {
5731
0
    default:
5732
0
        Py_UNREACHABLE();
5733
114k
    case PyUnicode_1BYTE_KIND:
5734
114k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5735
114k
                                      _Py_ERROR_STRICT, NULL, &end);
5736
114k
        break;
5737
38.8k
    case PyUnicode_2BYTE_KIND:
5738
38.8k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5739
38.8k
                                      _Py_ERROR_STRICT, NULL, &end);
5740
38.8k
        break;
5741
8.27k
    case PyUnicode_4BYTE_KIND:
5742
8.27k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5743
8.27k
                                      _Py_ERROR_STRICT, NULL, &end);
5744
8.27k
        break;
5745
162k
    }
5746
162k
    if (writer == NULL) {
5747
207
        return -1;
5748
207
    }
5749
5750
161k
    const char *start = PyBytesWriter_GetData(writer);
5751
161k
    Py_ssize_t len = end - start;
5752
5753
161k
    char *cache = PyMem_Malloc(len + 1);
5754
161k
    if (cache == NULL) {
5755
0
        PyBytesWriter_Discard(writer);
5756
0
        PyErr_NoMemory();
5757
0
        return -1;
5758
0
    }
5759
161k
    memcpy(cache, start, len);
5760
161k
    cache[len] = '\0';
5761
161k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5762
161k
    PyUnicode_SET_UTF8(unicode, cache);
5763
161k
    PyBytesWriter_Discard(writer);
5764
161k
    return 0;
5765
161k
}
5766
5767
/* Encode unicode to UTF-8 with the error handler named by errors; the
   handler enum is passed as "unknown" so the callee works from the name. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5772
5773
5774
/* Encode unicode to UTF-8, passing NULL as the errors argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *no_errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, no_errors);
}
5779
5780
/* --- UTF-32 Codec ------------------------------------------------------- */
5781
5782
PyObject *
5783
PyUnicode_DecodeUTF32(const char *s,
5784
                      Py_ssize_t size,
5785
                      const char *errors,
5786
                      int *byteorder)
5787
89
{
5788
89
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5789
89
}
5790
5791
/* Decode size bytes of UTF-32 data starting at s.

   errors is forwarded to the decode error-handler machinery.

   byteorder, if non-NULL, supplies the initial byte order (0 means "detect
   from a BOM"; after BOM handling a negative value selects little endian
   and a positive value big endian).  When the initial order is 0 and at
   least 4 bytes are available, the order resulting from BOM detection is
   written back to *byteorder.

   consumed, if non-NULL, receives the number of input bytes processed; in
   that case leftover bytes at the end are not reported as "truncated data".

   Return the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;     /* start of input; may be updated by the error handler */
    Py_ssize_t startinpos;      /* error span [startinpos, endinpos) as byte offsets */
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e; /* read cursor and end of input */
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are always assembled as little endian here,
           so 0x0000FEFF means an LE BOM and 0xFFFE0000 a BE BOM. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing (left) to decode. */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 and no BOM, fall back to the platform's byte order. */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    _PyUnicodeWriter_Init(&writer);
    /* One output character per 4 input bytes, rounded up. */
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        /* Largest code point the current writer buffer can hold. */
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast inner loop: write directly into the buffer while the
               code points fit its kind and are not surrogates.  On exit ch
               holds the last code unit read (written or not). */
            int kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;  /* last position with 4 bytes available */
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Classify why the fast loop stopped. */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* The last unit was written (or none was read): either all
               input is consumed, or fewer than 4 bytes remain. */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            if (ch < 0x110000) {
                /* Valid code point too wide for the current buffer kind:
                   write it through the writer and resume the fast loop. */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5935
5936
/* Encode the str object 'str' to UTF-32 and return a bytes object.

   byteorder: -1 selects "utf-32-le", 1 selects "utf-32-be"; 0 writes a BOM
   (0xFEFF) first and uses the "utf-32" encoding name for error reporting.
   The actual byte order used is derived from byteorder and the platform
   endianness (see native_ordering below).

   errors is forwarded to the encode error-handler machinery, which is
   invoked when the 2-byte/4-byte encoders stop before the end of the
   string ("surrogates not allowed").

   Return NULL with an exception set on failure. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* Guard against overflow of nsize * 4 (one extra unit for the BOM). */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    /* Number of 32-bit units to reserve: one per character plus the BOM. */
    Py_ssize_t nsize = len + (byteorder == 0);

    /* Non-zero when the requested order matches the platform's order;
       byteorder == 0 counts as native on either platform. */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        /* This path never calls the error handler, so the output size is
           known exactly up front. */
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (v == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
        if (byteorder == 0) {
            *out++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &out, native_ordering);
        }
        return v;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Encoding name used when reporting errors. */
    const char *encoding;
    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;   /* replacement returned by the error handler */

    for (Py_ssize_t pos = 0; pos < len; ) {
        /* Encode as far as possible; the helpers return the number of
           characters consumed and advance out. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* The encoder stopped early at str[pos]: ask the error handler
           what to emit instead and where to resume (newpos). */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        /* repsize: size of the replacement (bytes for a bytes object,
           characters for a str); moreunits: 32-bit units it will occupy. */
        Py_ssize_t repsize, moreunits;
        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must be a whole number of 4-byte units. */
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            /* A str replacement is only accepted if it is pure ASCII. */
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        /* Subtract the units already reserved for the characters being
           skipped (newpos - pos of them). */
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            /* Growing may move the buffer; out is re-based accordingly. */
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (PyBytes_Check(rep)) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6089
6090
/* Encode unicode to UTF-32 with a NULL errors argument and byteorder 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *no_errors = NULL;
    const int default_byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, no_errors, default_byteorder);
}
6095
6096
/* --- UTF-16 Codec ------------------------------------------------------- */
6097
6098
PyObject *
6099
PyUnicode_DecodeUTF16(const char *s,
6100
                      Py_ssize_t size,
6101
                      const char *errors,
6102
                      int *byteorder)
6103
89
{
6104
89
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6105
89
}
6106
6107
PyObject *
6108
PyUnicode_DecodeUTF16Stateful(const char *s,
6109
                              Py_ssize_t size,
6110
                              const char *errors,
6111
                              int *byteorder,
6112
                              Py_ssize_t *consumed)
6113
14.8k
{
6114
14.8k
    const char *starts = s;
6115
14.8k
    Py_ssize_t startinpos;
6116
14.8k
    Py_ssize_t endinpos;
6117
14.8k
    _PyUnicodeWriter writer;
6118
14.8k
    const unsigned char *q, *e;
6119
14.8k
    int bo = 0;       /* assume native ordering by default */
6120
14.8k
    int native_ordering;
6121
14.8k
    const char *errmsg = "";
6122
14.8k
    PyObject *errorHandler = NULL;
6123
14.8k
    PyObject *exc = NULL;
6124
14.8k
    const char *encoding;
6125
6126
14.8k
    q = (const unsigned char *)s;
6127
14.8k
    e = q + size;
6128
6129
14.8k
    if (byteorder)
6130
14.7k
        bo = *byteorder;
6131
6132
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6133
       byte order setting accordingly. In native mode, the leading BOM
6134
       mark is skipped, in all other modes, it is copied to the output
6135
       stream as-is (giving a ZWNBSP character). */
6136
14.8k
    if (bo == 0 && size >= 2) {
6137
14.0k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6138
14.0k
        if (bom == 0xFEFF) {
6139
286
            q += 2;
6140
286
            bo = -1;
6141
286
        }
6142
13.7k
        else if (bom == 0xFFFE) {
6143
2.54k
            q += 2;
6144
2.54k
            bo = 1;
6145
2.54k
        }
6146
14.0k
        if (byteorder)
6147
13.9k
            *byteorder = bo;
6148
14.0k
    }
6149
6150
14.8k
    if (q == e) {
6151
96
        if (consumed)
6152
0
            *consumed = size;
6153
96
        _Py_RETURN_UNICODE_EMPTY();
6154
96
    }
6155
6156
14.7k
#if PY_LITTLE_ENDIAN
6157
14.7k
    native_ordering = bo <= 0;
6158
14.7k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6159
#else
6160
    native_ordering = bo >= 0;
6161
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6162
#endif
6163
6164
    /* Note: size will always be longer than the resulting Unicode
6165
       character count normally.  Error handler will take care of
6166
       resizing when needed. */
6167
14.7k
    _PyUnicodeWriter_Init(&writer);
6168
14.7k
    writer.min_length = (e - q + 1) / 2;
6169
14.7k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6170
0
        goto onError;
6171
6172
53.7k
    while (1) {
6173
53.7k
        Py_UCS4 ch = 0;
6174
53.7k
        if (e - q >= 2) {
6175
45.5k
            int kind = writer.kind;
6176
45.5k
            if (kind == PyUnicode_1BYTE_KIND) {
6177
17.8k
                if (PyUnicode_IS_ASCII(writer.buffer))
6178
14.1k
                    ch = asciilib_utf16_decode(&q, e,
6179
14.1k
                            (Py_UCS1*)writer.data, &writer.pos,
6180
14.1k
                            native_ordering);
6181
3.64k
                else
6182
3.64k
                    ch = ucs1lib_utf16_decode(&q, e,
6183
3.64k
                            (Py_UCS1*)writer.data, &writer.pos,
6184
3.64k
                            native_ordering);
6185
27.7k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6186
12.3k
                ch = ucs2lib_utf16_decode(&q, e,
6187
12.3k
                        (Py_UCS2*)writer.data, &writer.pos,
6188
12.3k
                        native_ordering);
6189
15.3k
            } else {
6190
15.3k
                assert(kind == PyUnicode_4BYTE_KIND);
6191
15.3k
                ch = ucs4lib_utf16_decode(&q, e,
6192
15.3k
                        (Py_UCS4*)writer.data, &writer.pos,
6193
15.3k
                        native_ordering);
6194
15.3k
            }
6195
45.5k
        }
6196
6197
53.7k
        switch (ch)
6198
53.7k
        {
6199
15.3k
        case 0:
6200
            /* remaining byte at the end? (size should be even) */
6201
15.3k
            if (q == e || consumed)
6202
9.92k
                goto End;
6203
5.46k
            errmsg = "truncated data";
6204
5.46k
            startinpos = ((const char *)q) - starts;
6205
5.46k
            endinpos = ((const char *)e) - starts;
6206
5.46k
            break;
6207
            /* The remaining input chars are ignored if the callback
6208
               chooses to skip the input */
6209
1.61k
        case 1:
6210
1.61k
            q -= 2;
6211
1.61k
            if (consumed)
6212
0
                goto End;
6213
1.61k
            errmsg = "unexpected end of data";
6214
1.61k
            startinpos = ((const char *)q) - starts;
6215
1.61k
            endinpos = ((const char *)e) - starts;
6216
1.61k
            break;
6217
13.0k
        case 2:
6218
13.0k
            errmsg = "illegal encoding";
6219
13.0k
            startinpos = ((const char *)q) - 2 - starts;
6220
13.0k
            endinpos = startinpos + 2;
6221
13.0k
            break;
6222
7.39k
        case 3:
6223
7.39k
            errmsg = "illegal UTF-16 surrogate";
6224
7.39k
            startinpos = ((const char *)q) - 4 - starts;
6225
7.39k
            endinpos = startinpos + 2;
6226
7.39k
            break;
6227
16.2k
        default:
6228
16.2k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6229
0
                goto onError;
6230
16.2k
            continue;
6231
53.7k
        }
6232
6233
27.5k
        if (unicode_decode_call_errorhandler_writer(
6234
27.5k
                errors,
6235
27.5k
                &errorHandler,
6236
27.5k
                encoding, errmsg,
6237
27.5k
                &starts,
6238
27.5k
                (const char **)&e,
6239
27.5k
                &startinpos,
6240
27.5k
                &endinpos,
6241
27.5k
                &exc,
6242
27.5k
                (const char **)&q,
6243
27.5k
                &writer))
6244
4.81k
            goto onError;
6245
27.5k
    }
6246
6247
9.92k
End:
6248
9.92k
    if (consumed)
6249
0
        *consumed = (const char *)q-starts;
6250
6251
9.92k
    Py_XDECREF(errorHandler);
6252
9.92k
    Py_XDECREF(exc);
6253
9.92k
    return _PyUnicodeWriter_Finish(&writer);
6254
6255
4.81k
  onError:
6256
4.81k
    _PyUnicodeWriter_Dealloc(&writer);
6257
4.81k
    Py_XDECREF(errorHandler);
6258
4.81k
    Py_XDECREF(exc);
6259
4.81k
    return NULL;
6260
14.7k
}
6261
6262
/* Encode the str object `str` as UTF-16 and return a new bytes object.
 *
 * errors:    error-handler name; only consulted when the low-level encoder
 *            stops before the end of the string (reported to the handler
 *            as "surrogates not allowed").
 * byteorder: < 0 little-endian, > 0 big-endian, 0 native byte order with
 *            a 0xFEFF code unit written first.
 *
 * Returns NULL on failure.
 */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int src_kind = PyUnicode_KIND(str);
    const void *src = PyUnicode_DATA(str);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    /* One extra code unit is needed for the BOM when byteorder is 0. */
    const int bom_units = (byteorder == 0);

    /* Characters outside the BMP take two UTF-16 code units each. */
    Py_ssize_t astral = 0;
    if (src_kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *wide = (const Py_UCS4 *)src;
        for (Py_ssize_t i = 0; i < nchars; i++) {
            if (wide[i] >= 0x10000) {
                astral++;
            }
        }
    }

    /* Guard the `units * 2` byte count below against overflow. */
    if (nchars > PY_SSIZE_T_MAX / 2 - astral - bom_units) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t units = nchars + astral + bom_units;

#if PY_BIG_ENDIAN
    const int native_ordering = byteorder >= 0;
#else
    const int native_ordering = byteorder <= 0;
#endif

    if (src_kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, units * 2);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(result);
        if (bom_units) {
            *dst++ = 0xFEFF;
        }
        if (nchars > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)src, nchars, &dst,
                                 native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(units * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *dst = PyBytesWriter_GetData(writer);
    if (bom_units) {
        *dst++ = 0xFEFF;
    }
    if (nchars == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name handed to the error handler and used in exceptions. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *handler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        const Py_ssize_t left = nchars - pos;

        /* The helper's return value is the number of characters it
           consumed; it stops early at a character it cannot encode. */
        if (src_kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)src + pos, left,
                                        &dst, native_ordering);
        }
        else {
            assert(src_kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)src + pos, left,
                                        &dst, native_ordering);
        }
        if (pos == nchars) {
            break;
        }

        Py_ssize_t resume;
        rep = unicode_encode_call_errorhandler(
                errors, &handler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &resume);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must contain whole code units; a str
           replacement must be ASCII.  Anything else is rejected. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t rep_len;
        Py_ssize_t extra;
        int rep_rejected;
        if (rep_is_bytes) {
            rep_len = PyBytes_GET_SIZE(rep);
            extra = rep_len / 2;
            rep_rejected = (rep_len & 1) != 0;
        }
        else {
            assert(PyUnicode_Check(rep));
            rep_len = PyUnicode_GET_LENGTH(rep);
            extra = rep_len;
            rep_rejected = !PyUnicode_IS_ASCII(rep);
        }
        if (rep_rejected) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        extra += pos - resume;
        pos = resume;

        /* two bytes are reserved for each surrogate */
        if (extra > 0) {
            dst = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * extra, dst);
            if (dst == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(dst, PyBytes_AS_STRING(rep), rep_len);
            dst += rep_len / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), rep_len,
                                 &dst, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(handler);
    Py_XDECREF(exc);

    /* Trim to the size actually written.  This matters, for example, when
       the string contains isolated surrogates and the 'ignore' handler
       dropped them. */
    return PyBytesWriter_FinishWithPointer(writer, dst);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(handler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6427
6428
/* Encode `unicode` as UTF-16 by delegating to _PyUnicode_EncodeUTF16()
 * with no explicit error-handler name and byteorder 0, which selects
 * native byte order with a leading BOM.
 */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *default_errors = NULL;
    const int bom_native_order = 0;

    return _PyUnicode_EncodeUTF16(unicode, default_errors, bom_native_order);
}
6433
6434
_PyUnicode_Name_CAPI *
6435
_PyUnicode_GetNameCAPI(void)
6436
2.09k
{
6437
2.09k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6438
2.09k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6439
6440
2.09k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6441
2.09k
    if (ucnhash_capi == NULL) {
6442
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6443
1
                PyUnicodeData_CAPSULE_NAME, 1);
6444
6445
        // It's fine if we overwrite the value here. It's always the same value.
6446
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6447
1
    }
6448
2.09k
    return ucnhash_capi;
6449
2.09k
}
6450
6451
/* --- Unicode Escape Codec ----------------------------------------------- */
6452
6453
/* Decode `size` bytes of "unicodeescape" data starting at `s`.
 *
 * Bytes other than a backslash are written to the result as the code
 * point equal to the byte value.  Backslash sequences handled by the
 * switch below: backslash-newline (produces nothing), the one-character
 * escapes, \ooo octal, \xXX, \uXXXX, \UXXXXXXXX and \N{name}.  Any other
 * character following a backslash is written out together with the
 * backslash.
 *
 * errors:   forwarded to unicode_decode_call_errorhandler_writer().
 * consumed: if non-NULL, input that ends in the middle of an escape is
 *           not an error: *consumed is set to the offset of that escape's
 *           backslash and decoding stops there.  Apart from that case and
 *           the size == 0 case this function never writes *consumed;
 *           presumably callers initialize it beforehand (TODO confirm
 *           against callers).
 * first_invalid_escape_char, first_invalid_escape_ptr:
 *           out parameters, reset to -1 / NULL on entry.  The first
 *           unrecognized escape character, or the value of the first
 *           octal escape above 0377, is stored in
 *           *first_invalid_escape_char.  The pointer is only recorded
 *           while `starts` still equals the original input pointer.
 *
 * Returns the result of _PyUnicodeWriter_Finish(), or NULL on error.
 */
PyObject *
6454
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6455
                               Py_ssize_t size,
6456
                               const char *errors,
6457
                               Py_ssize_t *consumed,
6458
                               int *first_invalid_escape_char,
6459
                               const char **first_invalid_escape_ptr)
6460
27.6k
{
6461
27.6k
    const char *starts = s;
6462
27.6k
    const char *initial_starts = starts;
6463
27.6k
    _PyUnicodeWriter writer;
6464
27.6k
    const char *end;
6465
27.6k
    PyObject *errorHandler = NULL;
6466
27.6k
    PyObject *exc = NULL;
6467
27.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6468
6469
    // so we can remember if we've seen an invalid escape char or not
6470
27.6k
    *first_invalid_escape_char = -1;
6471
27.6k
    *first_invalid_escape_ptr = NULL;
6472
6473
27.6k
    if (size == 0) {
6474
1.80k
        if (consumed) {
6475
0
            *consumed = 0;
6476
0
        }
6477
1.80k
        _Py_RETURN_UNICODE_EMPTY();
6478
1.80k
    }
6479
    /* Escaped strings will always be longer than the resulting
6480
       Unicode string, so we start with size here and then reduce the
6481
       length after conversion to the true value.
6482
       (but if the error callback returns a long replacement string
6483
       we'll have to allocate more space) */
6484
25.8k
    _PyUnicodeWriter_Init(&writer);
6485
25.8k
    writer.min_length = size;
6486
25.8k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6487
0
        goto onError;
6488
0
    }
6489
6490
25.8k
    end = s + size;
6491
199k
    while (s < end) {
6492
173k
        unsigned char c = (unsigned char) *s++;
6493
173k
        Py_UCS4 ch;
6494
173k
        int count;
6495
173k
        const char *message;
6496
6497
        /* WRITE_ASCII_CHAR stores directly into the writer buffer and
           only asserts that there is room; WRITE_CHAR additionally goes
           through _PyUnicodeWriter_WriteCharInline() when ch is above the
           writer's current maxchar. */
173k
#define WRITE_ASCII_CHAR(ch)                                                  \
6498
173k
            do {                                                              \
6499
17.0k
                assert(ch <= 127);                                            \
6500
17.0k
                assert(writer.pos < writer.size);                             \
6501
17.0k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6502
17.0k
            } while(0)
6503
6504
173k
#define WRITE_CHAR(ch)                                                        \
6505
173k
            do {                                                              \
6506
161k
                if (ch <= writer.maxchar) {                                   \
6507
146k
                    assert(writer.pos < writer.size);                         \
6508
146k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6509
146k
                }                                                             \
6510
161k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6511
0
                    goto onError;                                             \
6512
0
                }                                                             \
6513
161k
            } while(0)
6514
6515
        /* Non-escape characters are interpreted as Unicode ordinals */
6516
173k
        if (c != '\\') {
6517
123k
            WRITE_CHAR(c);
6518
123k
            continue;
6519
123k
        }
6520
6521
        /* Offset (relative to starts) of the backslash that opens this
           escape; used as the error start and as the stateful resume
           point. */
50.3k
        Py_ssize_t startinpos = s - starts - 1;
6522
        /* \ - Escapes */
6523
50.3k
        if (s >= end) {
6524
0
            message = "\\ at end of string";
6525
0
            goto incomplete;
6526
0
        }
6527
50.3k
        c = (unsigned char) *s++;
6528
6529
50.3k
        assert(writer.pos < writer.size);
6530
50.3k
        switch (c) {
6531
6532
            /* \x escapes */
6533
            /* Backslash followed by a newline produces no output. */
764
        case '\n': continue;
6534
1.58k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6535
925
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6536
1.56k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6537
1.20k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6538
        /* FF */
6539
858
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6540
852
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6541
945
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6542
1.87k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6543
        /* VT */
6544
884
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6545
        /* BEL, not classic C */
6546
879
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6547
6548
            /* \OOO (octal) escapes */
6549
4.49k
        case '0': case '1': case '2': case '3':
6550
7.90k
        case '4': case '5': case '6': case '7':
6551
            /* One digit has been read already; accept up to two more. */
7.90k
            ch = c - '0';
6552
7.90k
            if (s < end && '0' <= *s && *s <= '7') {
6553
4.70k
                ch = (ch<<3) + *s++ - '0';
6554
4.70k
                if (s < end && '0' <= *s && *s <= '7') {
6555
3.32k
                    ch = (ch<<3) + *s++ - '0';
6556
3.32k
                }
6557
4.70k
            }
6558
            /* A value above 0377 is remembered as the first invalid
               escape (if none was seen yet) but is still written. */
7.90k
            if (ch > 0377) {
6559
1.67k
                if (*first_invalid_escape_char == -1) {
6560
1.08k
                    *first_invalid_escape_char = ch;
6561
1.08k
                    if (starts == initial_starts) {
6562
                        /* Back up 3 chars, since we've already incremented s. */
6563
1.08k
                        *first_invalid_escape_ptr = s - 3;
6564
1.08k
                    }
6565
1.08k
                }
6566
1.67k
            }
6567
7.90k
            WRITE_CHAR(ch);
6568
7.90k
            continue;
6569
6570
            /* hex escapes */
6571
            /* \xXX */
6572
7.90k
        case 'x':
6573
5.55k
            count = 2;
6574
5.55k
            message = "truncated \\xXX escape";
6575
5.55k
            goto hexescape;
6576
6577
            /* \uXXXX */
6578
6.50k
        case 'u':
6579
6.50k
            count = 4;
6580
6.50k
            message = "truncated \\uXXXX escape";
6581
6.50k
            goto hexescape;
6582
6583
            /* \UXXXXXXXX */
6584
10.5k
        case 'U':
6585
10.5k
            count = 8;
6586
10.5k
            message = "truncated \\UXXXXXXXX escape";
6587
            /* Shared by \x, \u and \U: accumulate exactly `count` hex
               digits into ch.  Running out of input goes to `incomplete`;
               a non-hex character goes to `error` with the "truncated"
               message chosen above. */
22.5k
        hexescape:
6588
143k
            for (ch = 0; count; ++s, --count) {
6589
121k
                if (s >= end) {
6590
8
                    goto incomplete;
6591
8
                }
6592
121k
                c = (unsigned char)*s;
6593
121k
                ch <<= 4;
6594
121k
                if (c >= '0' && c <= '9') {
6595
92.1k
                    ch += c - '0';
6596
92.1k
                }
6597
29.2k
                else if (c >= 'a' && c <= 'f') {
6598
29.0k
                    ch += c - ('a' - 10);
6599
29.0k
                }
6600
229
                else if (c >= 'A' && c <= 'F') {
6601
220
                    ch += c - ('A' - 10);
6602
220
                }
6603
9
                else {
6604
9
                    goto error;
6605
9
                }
6606
121k
            }
6607
6608
            /* when we get here, ch is a 32-bit unicode character */
6609
22.5k
            if (ch > MAX_UNICODE) {
6610
1
                message = "illegal Unicode character";
6611
1
                goto error;
6612
1
            }
6613
6614
22.5k
            WRITE_CHAR(ch);
6615
22.5k
            continue;
6616
6617
            /* \N{name} */
6618
22.5k
        case 'N':
6619
2.09k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6620
2.09k
            if (ucnhash_capi == NULL) {
6621
0
                PyErr_SetString(
6622
0
                        PyExc_UnicodeError,
6623
0
                        "\\N escapes not supported (can't load unicodedata module)"
6624
0
                );
6625
0
                goto onError;
6626
0
            }
6627
6628
2.09k
            message = "malformed \\N character escape";
6629
2.09k
            if (s >= end) {
6630
2
                goto incomplete;
6631
2
            }
6632
2.09k
            if (*s == '{') {
6633
2.09k
                const char *start = ++s;
6634
2.09k
                size_t namelen;
6635
                /* look for the closing brace */
6636
28.7k
                while (s < end && *s != '}')
6637
26.6k
                    s++;
6638
2.09k
                if (s >= end) {
6639
7
                    goto incomplete;
6640
7
                }
6641
2.08k
                namelen = s - start;
6642
2.08k
                if (namelen) {
6643
                    /* found a name.  look it up in the unicode database */
6644
2.08k
                    s++;
6645
2.08k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6646
2.08k
                    if (namelen <= INT_MAX &&
6647
2.08k
                        ucnhash_capi->getcode(start, (int)namelen,
6648
2.08k
                                              &ch, 0)) {
6649
2.02k
                        assert(ch <= MAX_UNICODE);
6650
2.02k
                        WRITE_CHAR(ch);
6651
2.02k
                        continue;
6652
2.02k
                    }
6653
60
                    message = "unknown Unicode character name";
6654
60
                }
6655
2.08k
            }
6656
            /* Reached for a missing '{', an empty name, or a failed
               lookup. */
66
            goto error;
6657
6658
5.43k
        default:
6659
            /* Unrecognized escape: remember the first one and keep both
               the backslash and the character in the output. */
5.43k
            if (*first_invalid_escape_char == -1) {
6660
3.34k
                *first_invalid_escape_char = c;
6661
3.34k
                if (starts == initial_starts) {
6662
                    /* Back up one char, since we've already incremented s. */
6663
3.34k
                    *first_invalid_escape_ptr = s - 1;
6664
3.34k
                }
6665
3.34k
            }
6666
5.43k
            WRITE_ASCII_CHAR('\\');
6667
5.43k
            WRITE_CHAR(c);
6668
5.43k
            continue;
6669
50.3k
        }
6670
6671
        /* The input ended inside an escape.  With a `consumed` pointer,
           report the offset of the escape's backslash and stop decoding;
           without one, fall through and treat it as an error. */
17
      incomplete:
6672
17
        if (consumed) {
6673
0
            *consumed = startinpos;
6674
0
            break;
6675
0
        }
6676
        /* Pass the span from startinpos to the current position to the
           error handler; min_length is updated first to cover the
           remaining input plus what has been written so far. */
93
      error:;
6677
93
        Py_ssize_t endinpos = s-starts;
6678
93
        writer.min_length = end - s + writer.pos;
6679
93
        if (unicode_decode_call_errorhandler_writer(
6680
93
                errors, &errorHandler,
6681
93
                "unicodeescape", message,
6682
93
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6683
93
                &writer)) {
6684
93
            goto onError;
6685
93
        }
6686
93
        assert(end - s <= writer.size - writer.pos);
6687
6688
0
#undef WRITE_ASCII_CHAR
6689
0
#undef WRITE_CHAR
6690
0
    }
6691
6692
25.7k
    Py_XDECREF(errorHandler);
6693
25.7k
    Py_XDECREF(exc);
6694
25.7k
    return _PyUnicodeWriter_Finish(&writer);
6695
6696
93
  onError:
6697
93
    _PyUnicodeWriter_Dealloc(&writer);
6698
93
    Py_XDECREF(errorHandler);
6699
93
    Py_XDECREF(exc);
6700
93
    return NULL;
6701
25.8k
}
6702
6703
PyObject *
6704
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6705
                              Py_ssize_t size,
6706
                              const char *errors,
6707
                              Py_ssize_t *consumed)
6708
0
{
6709
0
    int first_invalid_escape_char;
6710
0
    const char *first_invalid_escape_ptr;
6711
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6712
0
                                                      consumed,
6713
0
                                                      &first_invalid_escape_char,
6714
0
                                                      &first_invalid_escape_ptr);
6715
0
    if (result == NULL)
6716
0
        return NULL;
6717
0
    if (first_invalid_escape_char != -1) {
6718
0
        if (first_invalid_escape_char > 0xff) {
6719
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6720
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6721
0
                                 "Such sequences will not work in the future. ",
6722
0
                                 first_invalid_escape_char) < 0)
6723
0
            {
6724
0
                Py_DECREF(result);
6725
0
                return NULL;
6726
0
            }
6727
0
        }
6728
0
        else {
6729
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6730
0
                                 "\"\\%c\" is an invalid escape sequence. "
6731
0
                                 "Such sequences will not work in the future. ",
6732
0
                                 first_invalid_escape_char) < 0)
6733
0
            {
6734
0
                Py_DECREF(result);
6735
0
                return NULL;
6736
0
            }
6737
0
        }
6738
0
    }
6739
0
    return result;
6740
0
}
6741
6742
PyObject *
6743
PyUnicode_DecodeUnicodeEscape(const char *s,
6744
                              Py_ssize_t size,
6745
                              const char *errors)
6746
0
{
6747
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6748
0
}
6749
6750
/* Return a Unicode-Escape string version of the Unicode object. */
6751
6752
PyObject *
6753
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6754
383k
{
6755
383k
    if (!PyUnicode_Check(unicode)) {
6756
0
        PyErr_BadArgument();
6757
0
        return NULL;
6758
0
    }
6759
6760
383k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6761
383k
    if (len == 0) {
6762
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6763
0
    }
6764
383k
    int kind = PyUnicode_KIND(unicode);
6765
383k
    const void *data = PyUnicode_DATA(unicode);
6766
6767
    /* Initial allocation is based on the longest-possible character
6768
     * escape.
6769
     *
6770
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6771
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6772
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6773
383k
    Py_ssize_t expandsize = kind * 2 + 2;
6774
383k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6775
0
        return PyErr_NoMemory();
6776
0
    }
6777
6778
383k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6779
383k
    if (writer == NULL) {
6780
0
        return NULL;
6781
0
    }
6782
383k
    char *p = PyBytesWriter_GetData(writer);
6783
6784
766k
    for (Py_ssize_t i = 0; i < len; i++) {
6785
383k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6786
6787
        /* U+0000-U+00ff range */
6788
383k
        if (ch < 0x100) {
6789
377k
            if (ch >= ' ' && ch < 127) {
6790
39.5k
                if (ch != '\\') {
6791
                    /* Copy printable US ASCII as-is */
6792
0
                    *p++ = (char) ch;
6793
0
                }
6794
                /* Escape backslashes */
6795
39.5k
                else {
6796
39.5k
                    *p++ = '\\';
6797
39.5k
                    *p++ = '\\';
6798
39.5k
                }
6799
39.5k
            }
6800
6801
            /* Map special whitespace to '\t', \n', '\r' */
6802
337k
            else if (ch == '\t') {
6803
3.20k
                *p++ = '\\';
6804
3.20k
                *p++ = 't';
6805
3.20k
            }
6806
334k
            else if (ch == '\n') {
6807
2.02k
                *p++ = '\\';
6808
2.02k
                *p++ = 'n';
6809
2.02k
            }
6810
332k
            else if (ch == '\r') {
6811
1.13k
                *p++ = '\\';
6812
1.13k
                *p++ = 'r';
6813
1.13k
            }
6814
6815
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6816
331k
            else {
6817
331k
                *p++ = '\\';
6818
331k
                *p++ = 'x';
6819
331k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6820
331k
                *p++ = Py_hexdigits[ch & 0x000F];
6821
331k
            }
6822
377k
        }
6823
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6824
5.77k
        else if (ch < 0x10000) {
6825
4.82k
            *p++ = '\\';
6826
4.82k
            *p++ = 'u';
6827
4.82k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6828
4.82k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6829
4.82k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6830
4.82k
            *p++ = Py_hexdigits[ch & 0x000F];
6831
4.82k
        }
6832
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6833
942
        else {
6834
6835
            /* Make sure that the first two digits are zero */
6836
942
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6837
942
            *p++ = '\\';
6838
942
            *p++ = 'U';
6839
942
            *p++ = '0';
6840
942
            *p++ = '0';
6841
942
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6842
942
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6843
942
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6844
942
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6845
942
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6846
942
            *p++ = Py_hexdigits[ch & 0x0000000F];
6847
942
        }
6848
383k
    }
6849
6850
383k
    return PyBytesWriter_FinishWithPointer(writer, p);
6851
383k
}
6852
6853
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6854
6855
PyObject *
6856
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6857
                                          Py_ssize_t size,
6858
                                          const char *errors,
6859
                                          Py_ssize_t *consumed)
6860
0
{
6861
0
    const char *starts = s;
6862
0
    _PyUnicodeWriter writer;
6863
0
    const char *end;
6864
0
    PyObject *errorHandler = NULL;
6865
0
    PyObject *exc = NULL;
6866
6867
0
    if (size == 0) {
6868
0
        if (consumed) {
6869
0
            *consumed = 0;
6870
0
        }
6871
0
        _Py_RETURN_UNICODE_EMPTY();
6872
0
    }
6873
6874
    /* Escaped strings will always be longer than the resulting
6875
       Unicode string, so we start with size here and then reduce the
6876
       length after conversion to the true value. (But decoding error
6877
       handler might have to resize the string) */
6878
0
    _PyUnicodeWriter_Init(&writer);
6879
0
    writer.min_length = size;
6880
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6881
0
        goto onError;
6882
0
    }
6883
6884
0
    end = s + size;
6885
0
    while (s < end) {
6886
0
        unsigned char c = (unsigned char) *s++;
6887
0
        Py_UCS4 ch;
6888
0
        int count;
6889
0
        const char *message;
6890
6891
0
#define WRITE_CHAR(ch)                                                        \
6892
0
            do {                                                              \
6893
0
                if (ch <= writer.maxchar) {                                   \
6894
0
                    assert(writer.pos < writer.size);                         \
6895
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6896
0
                }                                                             \
6897
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6898
0
                    goto onError;                                             \
6899
0
                }                                                             \
6900
0
            } while(0)
6901
6902
        /* Non-escape characters are interpreted as Unicode ordinals */
6903
0
        if (c != '\\' || (s >= end && !consumed)) {
6904
0
            WRITE_CHAR(c);
6905
0
            continue;
6906
0
        }
6907
6908
0
        Py_ssize_t startinpos = s - starts - 1;
6909
        /* \ - Escapes */
6910
0
        if (s >= end) {
6911
0
            assert(consumed);
6912
            // Set message to silent compiler warning.
6913
            // Actually it is never used.
6914
0
            message = "\\ at end of string";
6915
0
            goto incomplete;
6916
0
        }
6917
6918
0
        c = (unsigned char) *s++;
6919
0
        if (c == 'u') {
6920
0
            count = 4;
6921
0
            message = "truncated \\uXXXX escape";
6922
0
        }
6923
0
        else if (c == 'U') {
6924
0
            count = 8;
6925
0
            message = "truncated \\UXXXXXXXX escape";
6926
0
        }
6927
0
        else {
6928
0
            assert(writer.pos < writer.size);
6929
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6930
0
            WRITE_CHAR(c);
6931
0
            continue;
6932
0
        }
6933
6934
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6935
0
        for (ch = 0; count; ++s, --count) {
6936
0
            if (s >= end) {
6937
0
                goto incomplete;
6938
0
            }
6939
0
            c = (unsigned char)*s;
6940
0
            ch <<= 4;
6941
0
            if (c >= '0' && c <= '9') {
6942
0
                ch += c - '0';
6943
0
            }
6944
0
            else if (c >= 'a' && c <= 'f') {
6945
0
                ch += c - ('a' - 10);
6946
0
            }
6947
0
            else if (c >= 'A' && c <= 'F') {
6948
0
                ch += c - ('A' - 10);
6949
0
            }
6950
0
            else {
6951
0
                goto error;
6952
0
            }
6953
0
        }
6954
0
        if (ch > MAX_UNICODE) {
6955
0
            message = "\\Uxxxxxxxx out of range";
6956
0
            goto error;
6957
0
        }
6958
0
        WRITE_CHAR(ch);
6959
0
        continue;
6960
6961
0
      incomplete:
6962
0
        if (consumed) {
6963
0
            *consumed = startinpos;
6964
0
            break;
6965
0
        }
6966
0
      error:;
6967
0
        Py_ssize_t endinpos = s-starts;
6968
0
        writer.min_length = end - s + writer.pos;
6969
0
        if (unicode_decode_call_errorhandler_writer(
6970
0
                errors, &errorHandler,
6971
0
                "rawunicodeescape", message,
6972
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6973
0
                &writer)) {
6974
0
            goto onError;
6975
0
        }
6976
0
        assert(end - s <= writer.size - writer.pos);
6977
6978
0
#undef WRITE_CHAR
6979
0
    }
6980
0
    Py_XDECREF(errorHandler);
6981
0
    Py_XDECREF(exc);
6982
0
    return _PyUnicodeWriter_Finish(&writer);
6983
6984
0
  onError:
6985
0
    _PyUnicodeWriter_Dealloc(&writer);
6986
0
    Py_XDECREF(errorHandler);
6987
0
    Py_XDECREF(exc);
6988
0
    return NULL;
6989
0
}
6990
6991
PyObject *
6992
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6993
                                 Py_ssize_t size,
6994
                                 const char *errors)
6995
0
{
6996
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6997
0
}
6998
6999
7000
/* Encode a str object with the "raw-unicode-escape" codec.

   Code points below U+0100 become a single byte, U+0100..U+FFFF become
   \uHHHH and everything above becomes \U00HHHHHH.

   Returns a bytes object, or NULL on failure (a non-str argument is
   reported through PyErr_BadArgument()). */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    if (length == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    int width = PyUnicode_KIND(unicode);
    const void *chars = PyUnicode_DATA(unicode);
    if (width == PyUnicode_1BYTE_KIND) {
        /* All code points fit in one byte: the storage is the result. */
        return PyBytes_FromStringAndSize(chars, length);
    }

    /* Worst-case output per input character: 6 bytes ("\uHHHH") for 2-byte
       storage and 10 bytes ("\U00HHHHHH") for 4-byte storage. */
    Py_ssize_t worst_case = width * 2 + 2;
    if (length > PY_SSIZE_T_MAX / worst_case) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(worst_case * length);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t index = 0;
    while (index < length) {
        Py_UCS4 ch = PyUnicode_READ(width, chars, index);
        index++;

        /* U+0000-U+00ff range: copied unchanged as one byte */
        if (ch < 0x100) {
            *out++ = (char)ch;
            continue;
        }

        int shift;   /* bit offset of the most significant hex digit */
        if (ch < 0x10000) {
            /* U+0100-U+ffff range: '\uHHHH' */
            *out++ = '\\';
            *out++ = 'u';
            shift = 12;
        }
        else {
            /* U+010000-U+10ffff range: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            shift = 20;
        }
        for (; shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7064
7065
/* --- Latin-1 Codec ------------------------------------------------------ */
7066
7067
PyObject *
7068
PyUnicode_DecodeLatin1(const char *s,
7069
                       Py_ssize_t size,
7070
                       const char *errors)
7071
2.63M
{
7072
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7073
2.63M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7074
2.63M
}
7075
7076
/* create or adjust a UnicodeEncodeError */
7077
static void
7078
make_encode_exception(PyObject **exceptionObject,
7079
                      const char *encoding,
7080
                      PyObject *unicode,
7081
                      Py_ssize_t startpos, Py_ssize_t endpos,
7082
                      const char *reason)
7083
227k
{
7084
227k
    if (*exceptionObject == NULL) {
7085
227k
        *exceptionObject = PyObject_CallFunction(
7086
227k
            PyExc_UnicodeEncodeError, "sOnns",
7087
227k
            encoding, unicode, startpos, endpos, reason);
7088
227k
    }
7089
0
    else {
7090
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7091
0
            goto onError;
7092
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7093
0
            goto onError;
7094
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7095
0
            goto onError;
7096
0
        return;
7097
0
      onError:
7098
0
        Py_CLEAR(*exceptionObject);
7099
0
    }
7100
227k
}
7101
7102
/* raises a UnicodeEncodeError */
7103
static void
7104
raise_encode_exception(PyObject **exceptionObject,
7105
                       const char *encoding,
7106
                       PyObject *unicode,
7107
                       Py_ssize_t startpos, Py_ssize_t endpos,
7108
                       const char *reason)
7109
62.2k
{
7110
62.2k
    make_encode_exception(exceptionObject,
7111
62.2k
                          encoding, unicode, startpos, endpos, reason);
7112
62.2k
    if (*exceptionObject != NULL)
7113
62.2k
        PyCodec_StrictErrors(*exceptionObject);
7114
62.2k
}
7115
7116
/* error handling callback helper:
7117
   build arguments, call the callback and check the arguments,
7118
   put the result into newpos and return the replacement string, which
7119
   has to be freed by the caller */
7120
static PyObject *
7121
unicode_encode_call_errorhandler(const char *errors,
7122
                                 PyObject **errorHandler,
7123
                                 const char *encoding, const char *reason,
7124
                                 PyObject *unicode, PyObject **exceptionObject,
7125
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7126
                                 Py_ssize_t *newpos)
7127
165k
{
7128
165k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7129
165k
    Py_ssize_t len;
7130
165k
    PyObject *restuple;
7131
165k
    PyObject *resunicode;
7132
7133
165k
    if (*errorHandler == NULL) {
7134
165k
        *errorHandler = PyCodec_LookupError(errors);
7135
165k
        if (*errorHandler == NULL)
7136
0
            return NULL;
7137
165k
    }
7138
7139
165k
    len = PyUnicode_GET_LENGTH(unicode);
7140
7141
165k
    make_encode_exception(exceptionObject,
7142
165k
                          encoding, unicode, startpos, endpos, reason);
7143
165k
    if (*exceptionObject == NULL)
7144
0
        return NULL;
7145
7146
165k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7147
165k
    if (restuple == NULL)
7148
165k
        return NULL;
7149
0
    if (!PyTuple_Check(restuple)) {
7150
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7151
0
        Py_DECREF(restuple);
7152
0
        return NULL;
7153
0
    }
7154
0
    if (!PyArg_ParseTuple(restuple, argparse,
7155
0
                          &resunicode, newpos)) {
7156
0
        Py_DECREF(restuple);
7157
0
        return NULL;
7158
0
    }
7159
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7160
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7161
0
        Py_DECREF(restuple);
7162
0
        return NULL;
7163
0
    }
7164
0
    if (*newpos<0)
7165
0
        *newpos = len + *newpos;
7166
0
    if (*newpos<0 || *newpos>len) {
7167
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7168
0
        Py_DECREF(restuple);
7169
0
        return NULL;
7170
0
    }
7171
0
    Py_INCREF(resunicode);
7172
0
    Py_DECREF(restuple);
7173
0
    return resunicode;
7174
0
}
7175
7176
static PyObject *
7177
unicode_encode_ucs1(PyObject *unicode,
7178
                    const char *errors,
7179
                    const Py_UCS4 limit)
7180
77.0k
{
7181
    /* input state */
7182
77.0k
    Py_ssize_t pos=0, size;
7183
77.0k
    int kind;
7184
77.0k
    const void *data;
7185
77.0k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7186
77.0k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7187
77.0k
    PyObject *error_handler_obj = NULL;
7188
77.0k
    PyObject *exc = NULL;
7189
77.0k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7190
77.0k
    PyObject *rep = NULL;
7191
7192
77.0k
    size = PyUnicode_GET_LENGTH(unicode);
7193
77.0k
    kind = PyUnicode_KIND(unicode);
7194
77.0k
    data = PyUnicode_DATA(unicode);
7195
    /* allocate enough for a simple encoding without
7196
       replacements, if we need more, we'll resize */
7197
77.0k
    if (size == 0)
7198
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7199
7200
    /* output object */
7201
77.0k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7202
77.0k
    if (writer == NULL) {
7203
0
        return NULL;
7204
0
    }
7205
    /* pointer into the output */
7206
77.0k
    char *str = PyBytesWriter_GetData(writer);
7207
7208
4.99M
    while (pos < size) {
7209
4.99M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7210
7211
        /* can we encode this? */
7212
4.99M
        if (ch < limit) {
7213
            /* no overflow check, because we know that the space is enough */
7214
4.91M
            *str++ = (char)ch;
7215
4.91M
            ++pos;
7216
4.91M
        }
7217
77.2k
        else {
7218
77.2k
            Py_ssize_t newpos, i;
7219
            /* startpos for collecting unencodable chars */
7220
77.2k
            Py_ssize_t collstart = pos;
7221
77.2k
            Py_ssize_t collend = collstart + 1;
7222
            /* find all unecodable characters */
7223
7224
624k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7225
547k
                ++collend;
7226
7227
            /* Only overallocate the buffer if it's not the last write */
7228
77.2k
            writer->overallocate = (collend < size);
7229
7230
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7231
77.2k
            if (error_handler == _Py_ERROR_UNKNOWN)
7232
77.0k
                error_handler = _Py_GetErrorHandler(errors);
7233
7234
77.2k
            switch (error_handler) {
7235
62.2k
            case _Py_ERROR_STRICT:
7236
62.2k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7237
62.2k
                goto onError;
7238
7239
4.88k
            case _Py_ERROR_REPLACE:
7240
4.88k
                memset(str, '?', collend - collstart);
7241
4.88k
                str += (collend - collstart);
7242
4.88k
                _Py_FALLTHROUGH;
7243
4.88k
            case _Py_ERROR_IGNORE:
7244
4.88k
                pos = collend;
7245
4.88k
                break;
7246
7247
0
            case _Py_ERROR_BACKSLASHREPLACE:
7248
                /* subtract preallocated bytes */
7249
0
                writer->size -= (collend - collstart);
7250
0
                str = backslashreplace(writer, str,
7251
0
                                       unicode, collstart, collend);
7252
0
                if (str == NULL)
7253
0
                    goto onError;
7254
0
                pos = collend;
7255
0
                break;
7256
7257
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7258
                /* subtract preallocated bytes */
7259
0
                writer->size -= (collend - collstart);
7260
0
                str = xmlcharrefreplace(writer, str,
7261
0
                                        unicode, collstart, collend);
7262
0
                if (str == NULL)
7263
0
                    goto onError;
7264
0
                pos = collend;
7265
0
                break;
7266
7267
10.0k
            case _Py_ERROR_SURROGATEESCAPE:
7268
10.0k
                for (i = collstart; i < collend; ++i) {
7269
10.0k
                    ch = PyUnicode_READ(kind, data, i);
7270
10.0k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7271
                        /* Not a UTF-8b surrogate */
7272
10.0k
                        break;
7273
10.0k
                    }
7274
0
                    *str++ = (char)(ch - 0xdc00);
7275
0
                    ++pos;
7276
0
                }
7277
10.0k
                if (i >= collend)
7278
0
                    break;
7279
10.0k
                collstart = pos;
7280
10.0k
                assert(collstart != collend);
7281
10.0k
                _Py_FALLTHROUGH;
7282
7283
10.0k
            default:
7284
10.0k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7285
10.0k
                                                       encoding, reason, unicode, &exc,
7286
10.0k
                                                       collstart, collend, &newpos);
7287
10.0k
                if (rep == NULL)
7288
10.0k
                    goto onError;
7289
7290
0
                if (newpos < collstart) {
7291
0
                    writer->overallocate = 1;
7292
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7293
0
                                                             collstart - newpos,
7294
0
                                                             str);
7295
0
                    if (str == NULL) {
7296
0
                        goto onError;
7297
0
                    }
7298
0
                }
7299
0
                else {
7300
                    /* subtract preallocated bytes */
7301
0
                    writer->size -= newpos - collstart;
7302
                    /* Only overallocate the buffer if it's not the last write */
7303
0
                    writer->overallocate = (newpos < size);
7304
0
                }
7305
7306
0
                char *rep_str;
7307
0
                Py_ssize_t rep_len;
7308
0
                if (PyBytes_Check(rep)) {
7309
                    /* Directly copy bytes result to output. */
7310
0
                    rep_str = PyBytes_AS_STRING(rep);
7311
0
                    rep_len = PyBytes_GET_SIZE(rep);
7312
0
                }
7313
0
                else {
7314
0
                    assert(PyUnicode_Check(rep));
7315
7316
0
                    if (limit == 256 ?
7317
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7318
0
                        !PyUnicode_IS_ASCII(rep))
7319
0
                    {
7320
                        /* Not all characters are smaller than limit */
7321
0
                        raise_encode_exception(&exc, encoding, unicode,
7322
0
                                               collstart, collend, reason);
7323
0
                        goto onError;
7324
0
                    }
7325
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7326
0
                    rep_str = PyUnicode_DATA(rep);
7327
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7328
0
                }
7329
7330
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7331
0
                if (str == NULL) {
7332
0
                    goto onError;
7333
0
                }
7334
0
                memcpy(str, rep_str, rep_len);
7335
0
                str += rep_len;
7336
7337
0
                pos = newpos;
7338
0
                Py_CLEAR(rep);
7339
77.2k
            }
7340
7341
            /* If overallocation was disabled, ensure that it was the last
7342
               write. Otherwise, we missed an optimization */
7343
77.2k
            assert(writer->overallocate || pos == size);
7344
4.88k
        }
7345
4.99M
    }
7346
7347
4.73k
    Py_XDECREF(error_handler_obj);
7348
4.73k
    Py_XDECREF(exc);
7349
4.73k
    return PyBytesWriter_FinishWithPointer(writer, str);
7350
7351
72.3k
  onError:
7352
72.3k
    Py_XDECREF(rep);
7353
72.3k
    PyBytesWriter_Discard(writer);
7354
72.3k
    Py_XDECREF(error_handler_obj);
7355
72.3k
    Py_XDECREF(exc);
7356
72.3k
    return NULL;
7357
77.0k
}
7358
7359
/* Encode `unicode` to Latin-1 bytes using the error handler `errors`.

   Returns a bytes object, or NULL on failure (a non-str argument is
   reported through PyErr_BadArgument()). */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present: the generic encoder applies the
           error handler or raises the exception. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's storage is already the encoded
       form. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7375
7376
/* Encode `unicode` to Latin-1 bytes, passing NULL as the error handler
   name to _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7381
7382
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7383
7384
PyObject *
7385
PyUnicode_DecodeASCII(const char *s,
7386
                      Py_ssize_t size,
7387
                      const char *errors)
7388
10.8M
{
7389
10.8M
    const char *starts = s;
7390
10.8M
    const char *e = s + size;
7391
10.8M
    PyObject *error_handler_obj = NULL;
7392
10.8M
    PyObject *exc = NULL;
7393
10.8M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7394
7395
10.8M
    if (size == 0)
7396
0
        _Py_RETURN_UNICODE_EMPTY();
7397
7398
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7399
10.8M
    if (size == 1 && (unsigned char)s[0] < 128) {
7400
14.5k
        return get_latin1_char((unsigned char)s[0]);
7401
14.5k
    }
7402
7403
    // Shortcut for simple case
7404
10.8M
    PyObject *u = PyUnicode_New(size, 127);
7405
10.8M
    if (u == NULL) {
7406
0
        return NULL;
7407
0
    }
7408
10.8M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7409
10.8M
    if (outpos == size) {
7410
8.66M
        return u;
7411
8.66M
    }
7412
7413
2.20M
    _PyUnicodeWriter writer;
7414
2.20M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7415
2.20M
    writer.pos = outpos;
7416
7417
2.20M
    s += outpos;
7418
2.20M
    int kind = writer.kind;
7419
2.20M
    void *data = writer.data;
7420
2.20M
    Py_ssize_t startinpos, endinpos;
7421
7422
21.6M
    while (s < e) {
7423
21.4M
        unsigned char c = (unsigned char)*s;
7424
21.4M
        if (c < 128) {
7425
7.86M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7426
7.86M
            writer.pos++;
7427
7.86M
            ++s;
7428
7.86M
            continue;
7429
7.86M
        }
7430
7431
        /* byte outsize range 0x00..0x7f: call the error handler */
7432
7433
13.6M
        if (error_handler == _Py_ERROR_UNKNOWN)
7434
2.20M
            error_handler = _Py_GetErrorHandler(errors);
7435
7436
13.6M
        switch (error_handler)
7437
13.6M
        {
7438
990k
        case _Py_ERROR_REPLACE:
7439
11.5M
        case _Py_ERROR_SURROGATEESCAPE:
7440
            /* Fast-path: the error handler only writes one character,
7441
               but we may switch to UCS2 at the first write */
7442
11.5M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7443
0
                goto onError;
7444
11.5M
            kind = writer.kind;
7445
11.5M
            data = writer.data;
7446
7447
11.5M
            if (error_handler == _Py_ERROR_REPLACE)
7448
990k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7449
10.6M
            else
7450
10.6M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7451
11.5M
            writer.pos++;
7452
11.5M
            ++s;
7453
11.5M
            break;
7454
7455
0
        case _Py_ERROR_IGNORE:
7456
0
            ++s;
7457
0
            break;
7458
7459
2.04M
        default:
7460
2.04M
            startinpos = s-starts;
7461
2.04M
            endinpos = startinpos + 1;
7462
2.04M
            if (unicode_decode_call_errorhandler_writer(
7463
2.04M
                    errors, &error_handler_obj,
7464
2.04M
                    "ascii", "ordinal not in range(128)",
7465
2.04M
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7466
2.04M
                    &writer))
7467
2.04M
                goto onError;
7468
0
            kind = writer.kind;
7469
0
            data = writer.data;
7470
13.6M
        }
7471
13.6M
    }
7472
168k
    Py_XDECREF(error_handler_obj);
7473
168k
    Py_XDECREF(exc);
7474
168k
    return _PyUnicodeWriter_Finish(&writer);
7475
7476
2.04M
  onError:
7477
2.04M
    _PyUnicodeWriter_Dealloc(&writer);
7478
2.04M
    Py_XDECREF(error_handler_obj);
7479
2.04M
    Py_XDECREF(exc);
7480
2.04M
    return NULL;
7481
2.20M
}
7482
7483
/* Encode `unicode` to ASCII bytes using the error handler `errors`.

   Returns a bytes object, or NULL on failure (a non-str argument is
   reported through PyErr_BadArgument()). */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder applies the
           error handler or raises the exception. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's storage is already the encoded
       form. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7497
7498
/* Encode `unicode` to ASCII bytes, passing NULL as the error handler name
   to _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7503
7504
#ifdef MS_WINDOWS
7505
7506
/* --- MBCS codecs for Windows -------------------------------------------- */
7507
7508
#if SIZEOF_INT < SIZEOF_SIZE_T
7509
#define NEED_RETRY
7510
#endif
7511
7512
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7513
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7514
   both cases also and avoids partial characters overrunning the
7515
   length limit in MultiByteToWideChar on Windows */
7516
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7517
7518
#ifndef WC_ERR_INVALID_CHARS
7519
#  define WC_ERR_INVALID_CHARS 0x0080
7520
#endif
7521
7522
static const char*
7523
code_page_name(UINT code_page, PyObject **obj)
7524
{
7525
    *obj = NULL;
7526
    if (code_page == CP_ACP)
7527
        return "mbcs";
7528
7529
    *obj = PyBytes_FromFormat("cp%u", code_page);
7530
    if (*obj == NULL)
7531
        return NULL;
7532
    return PyBytes_AS_STRING(*obj);
7533
}
7534
7535
static DWORD
7536
decode_code_page_flags(UINT code_page)
7537
{
7538
    if (code_page == CP_UTF7) {
7539
        /* The CP_UTF7 decoder only supports flags=0 */
7540
        return 0;
7541
    }
7542
    else
7543
        return MB_ERR_INVALID_CHARS;
7544
}
7545
7546
/*
7547
 * Decode a byte string from a Windows code page into unicode object in strict
7548
 * mode.
7549
 *
7550
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7551
 * OSError and returns -1 on other error.
7552
 */
7553
static int
7554
decode_code_page_strict(UINT code_page,
7555
                        wchar_t **buf,
7556
                        Py_ssize_t *bufsize,
7557
                        const char *in,
7558
                        int insize)
7559
{
7560
    DWORD flags = MB_ERR_INVALID_CHARS;
7561
    wchar_t *out;
7562
    DWORD outsize;
7563
7564
    /* First get the size of the result */
7565
    assert(insize > 0);
7566
    while ((outsize = MultiByteToWideChar(code_page, flags,
7567
                                          in, insize, NULL, 0)) <= 0)
7568
    {
7569
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7570
            goto error;
7571
        }
7572
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7573
        flags = 0;
7574
    }
7575
7576
    /* Extend a wchar_t* buffer */
7577
    Py_ssize_t n = *bufsize;   /* Get the current length */
7578
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7579
        return -1;
7580
    }
7581
    out = *buf + n;
7582
7583
    /* Do the conversion */
7584
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7585
    if (outsize <= 0)
7586
        goto error;
7587
    return insize;
7588
7589
error:
7590
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7591
        return -2;
7592
    PyErr_SetFromWindowsErr(0);
7593
    return -1;
7594
}
7595
7596
/*
7597
 * Decode a byte string from a code page into unicode object with an error
7598
 * handler.
7599
 *
7600
 * Returns consumed size if succeed, or raise an OSError or
7601
 * UnicodeDecodeError exception and returns -1 on error.
7602
 */
7603
static int
7604
decode_code_page_errors(UINT code_page,
7605
                        wchar_t **buf,
7606
                        Py_ssize_t *bufsize,
7607
                        const char *in, const int size,
7608
                        const char *errors, int final)
7609
{
7610
    const char *startin = in;
7611
    const char *endin = in + size;
7612
    DWORD flags = MB_ERR_INVALID_CHARS;
7613
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7614
       2000 English version of the message. */
7615
    const char *reason = "No mapping for the Unicode character exists "
7616
                         "in the target code page.";
7617
    /* each step cannot decode more than 1 character, but a character can be
7618
       represented as a surrogate pair */
7619
    wchar_t buffer[2], *out;
7620
    int insize;
7621
    Py_ssize_t outsize;
7622
    PyObject *errorHandler = NULL;
7623
    PyObject *exc = NULL;
7624
    PyObject *encoding_obj = NULL;
7625
    const char *encoding;
7626
    DWORD err;
7627
    int ret = -1;
7628
7629
    assert(size > 0);
7630
7631
    encoding = code_page_name(code_page, &encoding_obj);
7632
    if (encoding == NULL)
7633
        return -1;
7634
7635
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7636
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7637
           UnicodeDecodeError. */
7638
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7639
        if (exc != NULL) {
7640
            PyCodec_StrictErrors(exc);
7641
            Py_CLEAR(exc);
7642
        }
7643
        goto error;
7644
    }
7645
7646
    /* Extend a wchar_t* buffer */
7647
    Py_ssize_t n = *bufsize;   /* Get the current length */
7648
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7649
        PyErr_NoMemory();
7650
        goto error;
7651
    }
7652
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7653
        goto error;
7654
    }
7655
    out = *buf + n;
7656
7657
    /* Decode the byte string character per character */
7658
    while (in < endin)
7659
    {
7660
        /* Decode a character */
7661
        insize = 1;
7662
        do
7663
        {
7664
            outsize = MultiByteToWideChar(code_page, flags,
7665
                                          in, insize,
7666
                                          buffer, Py_ARRAY_LENGTH(buffer));
7667
            if (outsize > 0)
7668
                break;
7669
            err = GetLastError();
7670
            if (err == ERROR_INVALID_FLAGS && flags) {
7671
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7672
                flags = 0;
7673
                continue;
7674
            }
7675
            if (err != ERROR_NO_UNICODE_TRANSLATION
7676
                && err != ERROR_INSUFFICIENT_BUFFER)
7677
            {
7678
                PyErr_SetFromWindowsErr(err);
7679
                goto error;
7680
            }
7681
            insize++;
7682
        }
7683
        /* 4=maximum length of a UTF-8 sequence */
7684
        while (insize <= 4 && (in + insize) <= endin);
7685
7686
        if (outsize <= 0) {
7687
            Py_ssize_t startinpos, endinpos, outpos;
7688
7689
            /* last character in partial decode? */
7690
            if (in + insize >= endin && !final)
7691
                break;
7692
7693
            startinpos = in - startin;
7694
            endinpos = startinpos + 1;
7695
            outpos = out - *buf;
7696
            if (unicode_decode_call_errorhandler_wchar(
7697
                    errors, &errorHandler,
7698
                    encoding, reason,
7699
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7700
                    buf, bufsize, &outpos))
7701
            {
7702
                goto error;
7703
            }
7704
            out = *buf + outpos;
7705
        }
7706
        else {
7707
            in += insize;
7708
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7709
            out += outsize;
7710
        }
7711
    }
7712
7713
    /* Shrink the buffer */
7714
    assert(out - *buf <= *bufsize);
7715
    *bufsize = out - *buf;
7716
    /* (in - startin) <= size and size is an int */
7717
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7718
7719
error:
7720
    Py_XDECREF(encoding_obj);
7721
    Py_XDECREF(errorHandler);
7722
    Py_XDECREF(exc);
7723
    return ret;
7724
}
7725
7726
static PyObject *
7727
decode_code_page_stateful(int code_page,
7728
                          const char *s, Py_ssize_t size,
7729
                          const char *errors, Py_ssize_t *consumed)
7730
{
7731
    wchar_t *buf = NULL;
7732
    Py_ssize_t bufsize = 0;
7733
    int chunk_size, final, converted, done;
7734
7735
    if (code_page < 0) {
7736
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7737
        return NULL;
7738
    }
7739
    if (size < 0) {
7740
        PyErr_BadInternalCall();
7741
        return NULL;
7742
    }
7743
7744
    if (consumed)
7745
        *consumed = 0;
7746
7747
    do
7748
    {
7749
#ifdef NEED_RETRY
7750
        if (size > DECODING_CHUNK_SIZE) {
7751
            chunk_size = DECODING_CHUNK_SIZE;
7752
            final = 0;
7753
            done = 0;
7754
        }
7755
        else
7756
#endif
7757
        {
7758
            chunk_size = (int)size;
7759
            final = (consumed == NULL);
7760
            done = 1;
7761
        }
7762
7763
        if (chunk_size == 0 && done) {
7764
            if (buf != NULL)
7765
                break;
7766
            _Py_RETURN_UNICODE_EMPTY();
7767
        }
7768
7769
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7770
                                            s, chunk_size);
7771
        if (converted == -2)
7772
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7773
                                                s, chunk_size,
7774
                                                errors, final);
7775
        assert(converted != 0 || done);
7776
7777
        if (converted < 0) {
7778
            PyMem_Free(buf);
7779
            return NULL;
7780
        }
7781
7782
        if (consumed)
7783
            *consumed += converted;
7784
7785
        s += converted;
7786
        size -= converted;
7787
    } while (!done);
7788
7789
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7790
    PyMem_Free(buf);
7791
    return v;
7792
}
7793
7794
PyObject *
7795
PyUnicode_DecodeCodePageStateful(int code_page,
7796
                                 const char *s,
7797
                                 Py_ssize_t size,
7798
                                 const char *errors,
7799
                                 Py_ssize_t *consumed)
7800
{
7801
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7802
}
7803
7804
PyObject *
7805
PyUnicode_DecodeMBCSStateful(const char *s,
7806
                             Py_ssize_t size,
7807
                             const char *errors,
7808
                             Py_ssize_t *consumed)
7809
{
7810
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7811
}
7812
7813
PyObject *
7814
PyUnicode_DecodeMBCS(const char *s,
7815
                     Py_ssize_t size,
7816
                     const char *errors)
7817
{
7818
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7819
}
7820
7821
static DWORD
7822
encode_code_page_flags(UINT code_page, const char *errors)
7823
{
7824
    if (code_page == CP_UTF8) {
7825
        return WC_ERR_INVALID_CHARS;
7826
    }
7827
    else if (code_page == CP_UTF7) {
7828
        /* CP_UTF7 only supports flags=0 */
7829
        return 0;
7830
    }
7831
    else {
7832
        if (errors != NULL && strcmp(errors, "replace") == 0)
7833
            return 0;
7834
        else
7835
            return WC_NO_BEST_FIT_CHARS;
7836
    }
7837
}
7838
7839
/*
7840
 * Encode a Unicode string to a Windows code page into a byte string in strict
7841
 * mode.
7842
 *
7843
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7844
 * an OSError and returns -1 on other error.
7845
 */
7846
static int
7847
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7848
                        PyObject *unicode, Py_ssize_t offset, int len,
7849
                        const char* errors)
7850
{
7851
    BOOL usedDefaultChar = FALSE;
7852
    BOOL *pusedDefaultChar = &usedDefaultChar;
7853
    int outsize;
7854
    wchar_t *p;
7855
    Py_ssize_t size;
7856
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7857
    char *out;
7858
    /* Create a substring so that we can get the UTF-16 representation
7859
       of just the slice under consideration. */
7860
    PyObject *substring;
7861
    int ret = -1;
7862
7863
    assert(len > 0);
7864
7865
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7866
        pusedDefaultChar = &usedDefaultChar;
7867
    else
7868
        pusedDefaultChar = NULL;
7869
7870
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7871
    if (substring == NULL)
7872
        return -1;
7873
    p = PyUnicode_AsWideCharString(substring, &size);
7874
    Py_CLEAR(substring);
7875
    if (p == NULL) {
7876
        return -1;
7877
    }
7878
    assert(size <= INT_MAX);
7879
7880
    /* First get the size of the result */
7881
    outsize = WideCharToMultiByte(code_page, flags,
7882
                                  p, (int)size,
7883
                                  NULL, 0,
7884
                                  NULL, pusedDefaultChar);
7885
    if (outsize <= 0)
7886
        goto error;
7887
    /* If we used a default char, then we failed! */
7888
    if (pusedDefaultChar && *pusedDefaultChar) {
7889
        ret = -2;
7890
        goto done;
7891
    }
7892
7893
    if (*writer == NULL) {
7894
        /* Create string object */
7895
        *writer = PyBytesWriter_Create(outsize);
7896
        if (*writer == NULL) {
7897
            goto done;
7898
        }
7899
        out = PyBytesWriter_GetData(*writer);
7900
    }
7901
    else {
7902
        /* Extend string object */
7903
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7904
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7905
            goto done;
7906
        }
7907
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7908
    }
7909
7910
    /* Do the conversion */
7911
    outsize = WideCharToMultiByte(code_page, flags,
7912
                                  p, (int)size,
7913
                                  out, outsize,
7914
                                  NULL, pusedDefaultChar);
7915
    if (outsize <= 0)
7916
        goto error;
7917
    if (pusedDefaultChar && *pusedDefaultChar) {
7918
        ret = -2;
7919
        goto done;
7920
    }
7921
    ret = 0;
7922
7923
done:
7924
    PyMem_Free(p);
7925
    return ret;
7926
7927
error:
7928
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7929
        ret = -2;
7930
        goto done;
7931
    }
7932
    PyErr_SetFromWindowsErr(0);
7933
    goto done;
7934
}
7935
7936
/*
7937
 * Encode a Unicode string to a Windows code page into a byte string using an
7938
 * error handler.
7939
 *
7940
 * Returns consumed characters if succeed, or raise an OSError and returns
7941
 * -1 on other error.
7942
 */
7943
static int
7944
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7945
                        PyObject *unicode, Py_ssize_t unicode_offset,
7946
                        Py_ssize_t insize, const char* errors)
7947
{
7948
    const DWORD flags = encode_code_page_flags(code_page, errors);
7949
    Py_ssize_t pos = unicode_offset;
7950
    Py_ssize_t endin = unicode_offset + insize;
7951
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7952
       2000 English version of the message. */
7953
    const char *reason = "invalid character";
7954
    /* 4=maximum length of a UTF-8 sequence */
7955
    char buffer[4];
7956
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7957
    Py_ssize_t outsize;
7958
    char *out;
7959
    PyObject *errorHandler = NULL;
7960
    PyObject *exc = NULL;
7961
    PyObject *encoding_obj = NULL;
7962
    const char *encoding;
7963
    Py_ssize_t newpos;
7964
    PyObject *rep;
7965
    int ret = -1;
7966
7967
    assert(insize > 0);
7968
7969
    encoding = code_page_name(code_page, &encoding_obj);
7970
    if (encoding == NULL)
7971
        return -1;
7972
7973
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7974
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7975
           then we raise a UnicodeEncodeError. */
7976
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7977
        if (exc != NULL) {
7978
            PyCodec_StrictErrors(exc);
7979
            Py_DECREF(exc);
7980
        }
7981
        Py_XDECREF(encoding_obj);
7982
        return -1;
7983
    }
7984
7985
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7986
        pusedDefaultChar = &usedDefaultChar;
7987
    else
7988
        pusedDefaultChar = NULL;
7989
7990
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7991
        PyErr_NoMemory();
7992
        goto error;
7993
    }
7994
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7995
7996
    if (*writer == NULL) {
7997
        /* Create string object */
7998
        *writer = PyBytesWriter_Create(outsize);
7999
        if (*writer == NULL) {
8000
            goto error;
8001
        }
8002
        out = PyBytesWriter_GetData(*writer);
8003
    }
8004
    else {
8005
        /* Extend string object */
8006
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8007
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8008
            goto error;
8009
        }
8010
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8011
    }
8012
8013
    /* Encode the string character per character */
8014
    while (pos < endin)
8015
    {
8016
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8017
        wchar_t chars[2];
8018
        int charsize;
8019
        if (ch < 0x10000) {
8020
            chars[0] = (wchar_t)ch;
8021
            charsize = 1;
8022
        }
8023
        else {
8024
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8025
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8026
            charsize = 2;
8027
        }
8028
8029
        outsize = WideCharToMultiByte(code_page, flags,
8030
                                      chars, charsize,
8031
                                      buffer, Py_ARRAY_LENGTH(buffer),
8032
                                      NULL, pusedDefaultChar);
8033
        if (outsize > 0) {
8034
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8035
            {
8036
                pos++;
8037
                memcpy(out, buffer, outsize);
8038
                out += outsize;
8039
                continue;
8040
            }
8041
        }
8042
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8043
            PyErr_SetFromWindowsErr(0);
8044
            goto error;
8045
        }
8046
8047
        rep = unicode_encode_call_errorhandler(
8048
                  errors, &errorHandler, encoding, reason,
8049
                  unicode, &exc,
8050
                  pos, pos + 1, &newpos);
8051
        if (rep == NULL)
8052
            goto error;
8053
8054
        Py_ssize_t morebytes = pos - newpos;
8055
        if (PyBytes_Check(rep)) {
8056
            outsize = PyBytes_GET_SIZE(rep);
8057
            morebytes += outsize;
8058
            if (morebytes > 0) {
8059
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8060
                if (out == NULL) {
8061
                    Py_DECREF(rep);
8062
                    goto error;
8063
                }
8064
            }
8065
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8066
            out += outsize;
8067
        }
8068
        else {
8069
            Py_ssize_t i;
8070
            int kind;
8071
            const void *data;
8072
8073
            outsize = PyUnicode_GET_LENGTH(rep);
8074
            morebytes += outsize;
8075
            if (morebytes > 0) {
8076
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8077
                if (out == NULL) {
8078
                    Py_DECREF(rep);
8079
                    goto error;
8080
                }
8081
            }
8082
            kind = PyUnicode_KIND(rep);
8083
            data = PyUnicode_DATA(rep);
8084
            for (i=0; i < outsize; i++) {
8085
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8086
                if (ch > 127) {
8087
                    raise_encode_exception(&exc,
8088
                        encoding, unicode,
8089
                        pos, pos + 1,
8090
                        "unable to encode error handler result to ASCII");
8091
                    Py_DECREF(rep);
8092
                    goto error;
8093
                }
8094
                *out = (unsigned char)ch;
8095
                out++;
8096
            }
8097
        }
8098
        pos = newpos;
8099
        Py_DECREF(rep);
8100
    }
8101
    /* write a NUL byte */
8102
    *out = 0;
8103
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8104
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8105
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8106
        goto error;
8107
    }
8108
    ret = 0;
8109
8110
error:
8111
    Py_XDECREF(encoding_obj);
8112
    Py_XDECREF(errorHandler);
8113
    Py_XDECREF(exc);
8114
    return ret;
8115
}
8116
8117
8118
/*
 * Encode the str object `unicode` to the Windows code page `code_page`
 * and return a bytes object (NULL on failure).
 *
 * The string is processed in chunks.  Each chunk is first tried with
 * encode_code_page_strict(); when that reports -2 the chunk is redone with
 * encode_code_page_errors(), which applies `errors`.
 */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;
    Py_ssize_t remaining;
    Py_ssize_t offset = 0;
    int last_chunk;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    do {
        int nchars, status;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            /* More characters than fit in one call: take a full chunk. */
            nchars = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            nchars = (int)remaining;
            last_chunk = 1;
        }

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, offset, nchars,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             nchars, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += nchars;
        remaining -= nchars;
    } while (!last_chunk);

    return PyBytesWriter_Finish(writer);
}
8176
8177
8178
/* Encode `unicode` with the CP_ACP code page, passing NULL for `errors`. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    return encoded;
}
8183
8184
#undef NEED_RETRY
8185
8186
#endif /* MS_WINDOWS */
8187
8188
/* --- Character Mapping Codec -------------------------------------------- */
8189
8190
static int
8191
charmap_decode_string(const char *s,
8192
                      Py_ssize_t size,
8193
                      PyObject *mapping,
8194
                      const char *errors,
8195
                      _PyUnicodeWriter *writer)
8196
585k
{
8197
585k
    const char *starts = s;
8198
585k
    const char *e;
8199
585k
    Py_ssize_t startinpos, endinpos;
8200
585k
    PyObject *errorHandler = NULL, *exc = NULL;
8201
585k
    Py_ssize_t maplen;
8202
585k
    int mapkind;
8203
585k
    const void *mapdata;
8204
585k
    Py_UCS4 x;
8205
585k
    unsigned char ch;
8206
8207
585k
    maplen = PyUnicode_GET_LENGTH(mapping);
8208
585k
    mapdata = PyUnicode_DATA(mapping);
8209
585k
    mapkind = PyUnicode_KIND(mapping);
8210
8211
585k
    e = s + size;
8212
8213
585k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8214
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8215
         * is disabled in encoding aliases, latin1 is preferred because
8216
         * its implementation is faster. */
8217
123
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8218
123
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8219
123
        Py_UCS4 maxchar = writer->maxchar;
8220
8221
123
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8222
8.06k
        while (s < e) {
8223
7.94k
            ch = *s;
8224
7.94k
            x = mapdata_ucs1[ch];
8225
7.94k
            if (x > maxchar) {
8226
114
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8227
0
                    goto onError;
8228
114
                maxchar = writer->maxchar;
8229
114
                outdata = (Py_UCS1 *)writer->data;
8230
114
            }
8231
7.94k
            outdata[writer->pos] = x;
8232
7.94k
            writer->pos++;
8233
7.94k
            ++s;
8234
7.94k
        }
8235
123
        return 0;
8236
123
    }
8237
8238
698k
    while (s < e) {
8239
683k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8240
683k
            int outkind = writer->kind;
8241
683k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8242
683k
            if (outkind == PyUnicode_1BYTE_KIND) {
8243
621k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8244
621k
                Py_UCS4 maxchar = writer->maxchar;
8245
11.8M
                while (s < e) {
8246
11.3M
                    ch = *s;
8247
11.3M
                    x = mapdata_ucs2[ch];
8248
11.3M
                    if (x > maxchar)
8249
76.2k
                        goto Error;
8250
11.2M
                    outdata[writer->pos] = x;
8251
11.2M
                    writer->pos++;
8252
11.2M
                    ++s;
8253
11.2M
                }
8254
545k
                break;
8255
621k
            }
8256
61.2k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8257
61.2k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8258
16.5M
                while (s < e) {
8259
16.5M
                    ch = *s;
8260
16.5M
                    x = mapdata_ucs2[ch];
8261
16.5M
                    if (x == 0xFFFE)
8262
37.0k
                        goto Error;
8263
16.4M
                    outdata[writer->pos] = x;
8264
16.4M
                    writer->pos++;
8265
16.4M
                    ++s;
8266
16.4M
                }
8267
24.1k
                break;
8268
61.2k
            }
8269
683k
        }
8270
0
        ch = *s;
8271
8272
0
        if (ch < maplen)
8273
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8274
0
        else
8275
0
            x = 0xfffe; /* invalid value */
8276
113k
Error:
8277
113k
        if (x == 0xfffe)
8278
58.9k
        {
8279
            /* undefined mapping */
8280
58.9k
            startinpos = s-starts;
8281
58.9k
            endinpos = startinpos+1;
8282
58.9k
            if (unicode_decode_call_errorhandler_writer(
8283
58.9k
                    errors, &errorHandler,
8284
58.9k
                    "charmap", "character maps to <undefined>",
8285
58.9k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8286
58.9k
                    writer)) {
8287
14
                goto onError;
8288
14
            }
8289
58.9k
            continue;
8290
58.9k
        }
8291
8292
54.4k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8293
0
            goto onError;
8294
54.4k
        ++s;
8295
54.4k
    }
8296
585k
    Py_XDECREF(errorHandler);
8297
585k
    Py_XDECREF(exc);
8298
585k
    return 0;
8299
8300
14
onError:
8301
14
    Py_XDECREF(errorHandler);
8302
14
    Py_XDECREF(exc);
8303
14
    return -1;
8304
585k
}
8305
8306
static int
8307
charmap_decode_mapping(const char *s,
8308
                       Py_ssize_t size,
8309
                       PyObject *mapping,
8310
                       const char *errors,
8311
                       _PyUnicodeWriter *writer)
8312
0
{
8313
0
    const char *starts = s;
8314
0
    const char *e;
8315
0
    Py_ssize_t startinpos, endinpos;
8316
0
    PyObject *errorHandler = NULL, *exc = NULL;
8317
0
    unsigned char ch;
8318
0
    PyObject *key, *item = NULL;
8319
8320
0
    e = s + size;
8321
8322
0
    while (s < e) {
8323
0
        ch = *s;
8324
8325
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8326
0
        key = PyLong_FromLong((long)ch);
8327
0
        if (key == NULL)
8328
0
            goto onError;
8329
8330
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8331
0
        Py_DECREF(key);
8332
0
        if (rc == 0) {
8333
            /* No mapping found means: mapping is undefined. */
8334
0
            goto Undefined;
8335
0
        }
8336
0
        if (item == NULL) {
8337
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8338
                /* No mapping found means: mapping is undefined. */
8339
0
                PyErr_Clear();
8340
0
                goto Undefined;
8341
0
            } else
8342
0
                goto onError;
8343
0
        }
8344
8345
        /* Apply mapping */
8346
0
        if (item == Py_None)
8347
0
            goto Undefined;
8348
0
        if (PyLong_Check(item)) {
8349
0
            long value = PyLong_AsLong(item);
8350
0
            if (value == 0xFFFE)
8351
0
                goto Undefined;
8352
0
            if (value < 0 || value > MAX_UNICODE) {
8353
0
                PyErr_Format(PyExc_TypeError,
8354
0
                             "character mapping must be in range(0x%x)",
8355
0
                             (unsigned long)MAX_UNICODE + 1);
8356
0
                goto onError;
8357
0
            }
8358
8359
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8360
0
                goto onError;
8361
0
        }
8362
0
        else if (PyUnicode_Check(item)) {
8363
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8364
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8365
0
                if (value == 0xFFFE)
8366
0
                    goto Undefined;
8367
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8368
0
                    goto onError;
8369
0
            }
8370
0
            else {
8371
0
                writer->overallocate = 1;
8372
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8373
0
                    goto onError;
8374
0
            }
8375
0
        }
8376
0
        else {
8377
            /* wrong return value */
8378
0
            PyErr_SetString(PyExc_TypeError,
8379
0
                            "character mapping must return integer, None or str");
8380
0
            goto onError;
8381
0
        }
8382
0
        Py_CLEAR(item);
8383
0
        ++s;
8384
0
        continue;
8385
8386
0
Undefined:
8387
        /* undefined mapping */
8388
0
        Py_CLEAR(item);
8389
0
        startinpos = s-starts;
8390
0
        endinpos = startinpos+1;
8391
0
        if (unicode_decode_call_errorhandler_writer(
8392
0
                errors, &errorHandler,
8393
0
                "charmap", "character maps to <undefined>",
8394
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8395
0
                writer)) {
8396
0
            goto onError;
8397
0
        }
8398
0
    }
8399
0
    Py_XDECREF(errorHandler);
8400
0
    Py_XDECREF(exc);
8401
0
    return 0;
8402
8403
0
onError:
8404
0
    Py_XDECREF(item);
8405
0
    Py_XDECREF(errorHandler);
8406
0
    Py_XDECREF(exc);
8407
0
    return -1;
8408
0
}
8409
8410
PyObject *
8411
PyUnicode_DecodeCharmap(const char *s,
8412
                        Py_ssize_t size,
8413
                        PyObject *mapping,
8414
                        const char *errors)
8415
585k
{
8416
585k
    _PyUnicodeWriter writer;
8417
8418
    /* Default to Latin-1 */
8419
585k
    if (mapping == NULL)
8420
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8421
8422
585k
    if (size == 0)
8423
0
        _Py_RETURN_UNICODE_EMPTY();
8424
585k
    _PyUnicodeWriter_Init(&writer);
8425
585k
    writer.min_length = size;
8426
585k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8427
0
        goto onError;
8428
8429
585k
    if (PyUnicode_CheckExact(mapping)) {
8430
585k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8431
14
            goto onError;
8432
585k
    }
8433
0
    else {
8434
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8435
0
            goto onError;
8436
0
    }
8437
585k
    return _PyUnicodeWriter_Finish(&writer);
8438
8439
14
  onError:
8440
14
    _PyUnicodeWriter_Dealloc(&writer);
8441
14
    return NULL;
8442
585k
}
8443
8444
/* Charmap encoding: the lookup table */
8445
8446
/*[clinic input]
8447
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8448
[clinic start generated code]*/
8449
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8450
8451
/* Three-level lookup trie built by PyUnicode_BuildEncodingMap() and read by
   encoding_map_lookup(); it maps a BMP code point to a byte value. */
struct encoding_map {
8452
    PyObject_HEAD
8453
    unsigned char level1[32];   /* indexed by ch >> 11; holds a level-2 block number, 0xFF = unused */
8454
    int count2, count3;         /* number of 16-entry level-2 blocks and 128-entry level-3 blocks */
8455
    unsigned char level23[1];   /* variable-length tail: 16*count2 level-2 bytes, then 128*count3 level-3 bytes */
8456
};
8457
8458
/*[clinic input]
8459
EncodingMap.size
8460
8461
Return the size (in bytes) of this object.
8462
[clinic start generated code]*/
8463
8464
static PyObject *
8465
EncodingMap_size_impl(struct encoding_map *self)
8466
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8467
0
{
8468
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8469
0
                           128*self->count3);
8470
0
}
8471
8472
/* Method table installed as EncodingMapType.tp_methods. */
static PyMethodDef encoding_map_methods[] = {
8473
    ENCODINGMAP_SIZE_METHODDEF  /* presumably the clinic-generated entry for EncodingMap.size -- defined outside this view */
8474
    {NULL, NULL}                /* end-of-table sentinel */
8475
};
8476
8477
/* Type object for struct encoding_map instances; PyUnicode_BuildEncodingMap()
   initialises its results with this type. */
static PyTypeObject EncodingMapType = {
8478
    PyVarObject_HEAD_INIT(NULL, 0)
8479
    .tp_name = "EncodingMap",
8480
    .tp_basicsize = sizeof(struct encoding_map),
8481
    /* methods */
8482
    .tp_flags = Py_TPFLAGS_DEFAULT,
8483
    .tp_methods = encoding_map_methods,
8484
};
8485
8486
/*
 * Build an encoding map from `string`, a decoding table whose index is the
 * byte value and whose element is the corresponding character.  Only the
 * first 256 entries are considered.
 *
 * The result is normally an EncodingMap object (a three-level trie keyed by
 * the code point).  A plain dict {code point: byte} is returned instead when
 * entry 0 is not U+0000, when a later entry is U+0000 or lies outside the
 * BMP, or when too many trie blocks would be needed.
 *
 * Returns NULL on failure.
 */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];     /* staging copy of the level-1 table */
    unsigned char level2[512];    /* used only to count level-3 blocks */
    int count2 = 0, count3 = 0;
    int need_dict = 0;
    int i;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    const int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);

    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }

    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        int l1 = ch >> 11;
        int l2 = ch >> 7;
        if (level1[l1] == 0xFF) {
            level1[l1] = count2++;
        }
        if (level2[l2] == 0xFF) {
            level2[l2] = count3++;
        }
    }

    /* 0xFF is the "unused" marker, so block numbers must stay below it. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        /* Fallback: a dict mapping each character to its byte value. */
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
            PyObject *key = PyLong_FromLong(c);
            if (key == NULL) {
                Py_DECREF(dict);
                return NULL;
            }
            PyObject *value = PyLong_FromLong(i);
            if (value == NULL) {
                Py_DECREF(key);
                Py_DECREF(dict);
                return NULL;
            }
            int rc = PyDict_SetItem(dict, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    struct encoding_map *map = (struct encoding_map*)result;
    map->count2 = count2;
    map->count3 = count3;

    /* level23 stores the level-2 blocks first, then the level-3 blocks. */
    unsigned char *mlevel1 = map->level1;
    unsigned char *mlevel2 = map->level23;
    unsigned char *mlevel3 = map->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: number the level-3 blocks again from zero and store the
       byte value of every mapped character. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        int o1 = ch>>11;
        int o2 = (ch>>7) & 0xF;
        int i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF) {
            mlevel2[i2] = count3++;
        }
        int o3 = ch & 0x7F;
        int i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8601
8602
static int
8603
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8604
0
{
8605
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8606
0
    int l1 = c>>11;
8607
0
    int l2 = (c>>7) & 0xF;
8608
0
    int l3 = c & 0x7F;
8609
0
    int i;
8610
8611
0
    if (c > 0xFFFF)
8612
0
        return -1;
8613
0
    if (c == 0)
8614
0
        return 0;
8615
    /* level 1*/
8616
0
    i = map->level1[l1];
8617
0
    if (i == 0xFF) {
8618
0
        return -1;
8619
0
    }
8620
    /* level 2*/
8621
0
    i = map->level23[16*i+l2];
8622
0
    if (i == 0xFF) {
8623
0
        return -1;
8624
0
    }
8625
    /* level 3 */
8626
0
    i = map->level23[16*map->count2 + 128*i + l3];
8627
0
    if (i == 0) {
8628
0
        return -1;
8629
0
    }
8630
0
    return i;
8631
0
}
8632
8633
/* Lookup the character in the mapping.
8634
   On success, return PyLong, PyBytes or None (if the character can't be found).
8635
   If the result is PyLong, put its value in replace.
8636
   On error, return NULL.
8637
   */
8638
static PyObject *
8639
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8640
0
{
8641
0
    PyObject *w = PyLong_FromLong((long)c);
8642
0
    PyObject *x;
8643
8644
0
    if (w == NULL)
8645
0
        return NULL;
8646
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8647
0
    Py_DECREF(w);
8648
0
    if (rc == 0) {
8649
        /* No mapping found means: mapping is undefined. */
8650
0
        Py_RETURN_NONE;
8651
0
    }
8652
0
    if (x == NULL) {
8653
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8654
            /* No mapping found means: mapping is undefined. */
8655
0
            PyErr_Clear();
8656
0
            Py_RETURN_NONE;
8657
0
        } else
8658
0
            return NULL;
8659
0
    }
8660
0
    else if (x == Py_None)
8661
0
        return x;
8662
0
    else if (PyLong_Check(x)) {
8663
0
        long value = PyLong_AsLong(x);
8664
0
        if (value < 0 || value > 255) {
8665
0
            PyErr_SetString(PyExc_TypeError,
8666
0
                            "character mapping must be in range(256)");
8667
0
            Py_DECREF(x);
8668
0
            return NULL;
8669
0
        }
8670
0
        *replace = (unsigned char)value;
8671
0
        return x;
8672
0
    }
8673
0
    else if (PyBytes_Check(x))
8674
0
        return x;
8675
0
    else {
8676
        /* wrong return value */
8677
0
        PyErr_Format(PyExc_TypeError,
8678
0
                     "character mapping must return integer, bytes or None, not %.400s",
8679
0
                     Py_TYPE(x)->tp_name);
8680
0
        Py_DECREF(x);
8681
0
        return NULL;
8682
0
    }
8683
0
}
8684
8685
static int
8686
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8687
0
{
8688
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8689
    /* exponentially overallocate to minimize reallocations */
8690
0
    if (requiredsize < 2 * outsize)
8691
0
        requiredsize = 2 * outsize;
8692
0
    return PyBytesWriter_Resize(writer, requiredsize);
8693
0
}
8694
8695
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8698
/* lookup the character, put the result in the output string and adjust
8699
   various state variables. Resize the output bytes object if not enough
8700
   space is available. Return a new reference to the object that
8701
   was put in the output buffer, or Py_None, if the mapping was undefined
8702
   (in which case no character was written) or NULL, if a
8703
   reallocation error occurred. The caller must decref the result */
8704
static charmapencode_result
8705
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8706
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8707
0
{
8708
0
    PyObject *rep;
8709
0
    unsigned char replace;
8710
0
    char *outstart;
8711
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8712
8713
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8714
0
        int res = encoding_map_lookup(c, mapping);
8715
0
        Py_ssize_t requiredsize = *outpos+1;
8716
0
        if (res == -1) {
8717
0
            return enc_FAILED;
8718
0
        }
8719
8720
0
        if (outsize<requiredsize) {
8721
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8722
0
                return enc_EXCEPTION;
8723
0
            }
8724
0
        }
8725
0
        outstart = _PyBytesWriter_GetData(writer);
8726
0
        outstart[(*outpos)++] = (char)res;
8727
0
        return enc_SUCCESS;
8728
0
    }
8729
8730
0
    rep = charmapencode_lookup(c, mapping, &replace);
8731
0
    if (rep==NULL)
8732
0
        return enc_EXCEPTION;
8733
0
    else if (rep==Py_None) {
8734
0
        Py_DECREF(rep);
8735
0
        return enc_FAILED;
8736
0
    } else {
8737
0
        if (PyLong_Check(rep)) {
8738
0
            Py_ssize_t requiredsize = *outpos+1;
8739
0
            if (outsize<requiredsize)
8740
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8741
0
                    Py_DECREF(rep);
8742
0
                    return enc_EXCEPTION;
8743
0
                }
8744
0
            outstart = _PyBytesWriter_GetData(writer);
8745
0
            outstart[(*outpos)++] = (char)replace;
8746
0
        }
8747
0
        else {
8748
0
            const char *repchars = PyBytes_AS_STRING(rep);
8749
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8750
0
            Py_ssize_t requiredsize = *outpos+repsize;
8751
0
            if (outsize<requiredsize)
8752
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8753
0
                    Py_DECREF(rep);
8754
0
                    return enc_EXCEPTION;
8755
0
                }
8756
0
            outstart = _PyBytesWriter_GetData(writer);
8757
0
            memcpy(outstart + *outpos, repchars, repsize);
8758
0
            *outpos += repsize;
8759
0
        }
8760
0
    }
8761
0
    Py_DECREF(rep);
8762
0
    return enc_SUCCESS;
8763
0
}
8764
8765
/* handle an error in _PyUnicode_EncodeCharmap()
8766
   Return 0 on success, -1 on error */
8767
static int
8768
charmap_encoding_error(
8769
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8770
    PyObject **exceptionObject,
8771
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8772
    PyBytesWriter *writer, Py_ssize_t *respos)
8773
0
{
8774
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8775
0
    Py_ssize_t size, repsize;
8776
0
    Py_ssize_t newpos;
8777
0
    int kind;
8778
0
    const void *data;
8779
0
    Py_ssize_t index;
8780
    /* startpos for collecting unencodable chars */
8781
0
    Py_ssize_t collstartpos = *inpos;
8782
0
    Py_ssize_t collendpos = *inpos+1;
8783
0
    Py_ssize_t collpos;
8784
0
    const char *encoding = "charmap";
8785
0
    const char *reason = "character maps to <undefined>";
8786
0
    charmapencode_result x;
8787
0
    Py_UCS4 ch;
8788
0
    int val;
8789
8790
0
    size = PyUnicode_GET_LENGTH(unicode);
8791
    /* find all unencodable characters */
8792
0
    while (collendpos < size) {
8793
0
        PyObject *rep;
8794
0
        unsigned char replace;
8795
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8796
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8797
0
            val = encoding_map_lookup(ch, mapping);
8798
0
            if (val != -1)
8799
0
                break;
8800
0
            ++collendpos;
8801
0
            continue;
8802
0
        }
8803
8804
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8805
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8806
0
        if (rep==NULL)
8807
0
            return -1;
8808
0
        else if (rep!=Py_None) {
8809
0
            Py_DECREF(rep);
8810
0
            break;
8811
0
        }
8812
0
        Py_DECREF(rep);
8813
0
        ++collendpos;
8814
0
    }
8815
    /* cache callback name lookup
8816
     * (if not done yet, i.e. it's the first error) */
8817
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8818
0
        *error_handler = _Py_GetErrorHandler(errors);
8819
8820
0
    switch (*error_handler) {
8821
0
    case _Py_ERROR_STRICT:
8822
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8823
0
        return -1;
8824
8825
0
    case _Py_ERROR_REPLACE:
8826
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8827
0
            x = charmapencode_output('?', mapping, writer, respos);
8828
0
            if (x==enc_EXCEPTION) {
8829
0
                return -1;
8830
0
            }
8831
0
            else if (x==enc_FAILED) {
8832
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8833
0
                return -1;
8834
0
            }
8835
0
        }
8836
0
        _Py_FALLTHROUGH;
8837
0
    case _Py_ERROR_IGNORE:
8838
0
        *inpos = collendpos;
8839
0
        break;
8840
8841
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8842
        /* generate replacement (temporarily (mis)uses p) */
8843
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8844
0
            char buffer[2+29+1+1];
8845
0
            char *cp;
8846
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8847
0
            for (cp = buffer; *cp; ++cp) {
8848
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8849
0
                if (x==enc_EXCEPTION)
8850
0
                    return -1;
8851
0
                else if (x==enc_FAILED) {
8852
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8853
0
                    return -1;
8854
0
                }
8855
0
            }
8856
0
        }
8857
0
        *inpos = collendpos;
8858
0
        break;
8859
8860
0
    default:
8861
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8862
0
                                                      encoding, reason, unicode, exceptionObject,
8863
0
                                                      collstartpos, collendpos, &newpos);
8864
0
        if (repunicode == NULL)
8865
0
            return -1;
8866
0
        if (PyBytes_Check(repunicode)) {
8867
            /* Directly copy bytes result to output. */
8868
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8869
0
            Py_ssize_t requiredsize;
8870
0
            repsize = PyBytes_Size(repunicode);
8871
0
            requiredsize = *respos + repsize;
8872
0
            if (requiredsize > outsize)
8873
                /* Make room for all additional bytes. */
8874
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8875
0
                    Py_DECREF(repunicode);
8876
0
                    return -1;
8877
0
                }
8878
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8879
0
                   PyBytes_AsString(repunicode),  repsize);
8880
0
            *respos += repsize;
8881
0
            *inpos = newpos;
8882
0
            Py_DECREF(repunicode);
8883
0
            break;
8884
0
        }
8885
        /* generate replacement  */
8886
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8887
0
        data = PyUnicode_DATA(repunicode);
8888
0
        kind = PyUnicode_KIND(repunicode);
8889
0
        for (index = 0; index < repsize; index++) {
8890
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8891
0
            x = charmapencode_output(repch, mapping, writer, respos);
8892
0
            if (x==enc_EXCEPTION) {
8893
0
                Py_DECREF(repunicode);
8894
0
                return -1;
8895
0
            }
8896
0
            else if (x==enc_FAILED) {
8897
0
                Py_DECREF(repunicode);
8898
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8899
0
                return -1;
8900
0
            }
8901
0
        }
8902
0
        *inpos = newpos;
8903
0
        Py_DECREF(repunicode);
8904
0
    }
8905
0
    return 0;
8906
0
}
8907
8908
/* Encode `unicode` into a bytes object using the character map `mapping`.

   A NULL mapping delegates to unicode_encode_ucs1() with a limit of 256
   (the Latin-1 default).  An empty string yields the empty-bytes
   constant.  Unencodable characters are passed to
   charmap_encoding_error() together with `errors`.
   Return the bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    if (mapping == NULL) {
        /* Default to Latin-1 */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);

    /* error handling state, shared by every error in this call */
    PyObject *exc = NULL;
    PyObject *error_handler_obj = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* read position in `unicode` and write position in the output */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* Start with one byte per character; replacements that need more
       space trigger a resize. */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        char *out = _PyBytesWriter_GetData(writer);
        Py_ssize_t capacity = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int byte = encoding_map_lookup(ch, mapping);

            if (byte == -1) {
                /* unencodable: the handler advances inpos itself and may
                   have resized the writer, so refresh the cached buffer */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                out = _PyBytesWriter_GetData(writer);
                capacity = _PyBytesWriter_GetSize(writer);
                continue;
            }

            if (capacity < respos + 1) {
                if (charmapencode_resize(writer, &respos, respos + 1)) {
                    goto onError;
                }
                out = _PyBytesWriter_GetData(writer);
                capacity = _PyBytesWriter_GetSize(writer);
            }
            out[respos++] = (char)byte;
            ++inpos;
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            switch (charmapencode_output(ch, mapping, writer, &respos)) {
            case enc_EXCEPTION:
                goto onError;

            case enc_FAILED:
                /* unencodable character: the handler moves inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;

            default:
                /* character written: move to the next one */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Trim the result to the bytes actually written. */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9016
9017
/* Encode the str object `unicode` with the character map `mapping`,
   passing NULL as the `errors` argument of _PyUnicode_EncodeCharmap().
   Calls PyErr_BadArgument() and returns NULL if `unicode` is not a str
   or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
9027
9028
/* create or adjust a UnicodeTranslateError */
9029
static void
9030
make_translate_exception(PyObject **exceptionObject,
9031
                         PyObject *unicode,
9032
                         Py_ssize_t startpos, Py_ssize_t endpos,
9033
                         const char *reason)
9034
0
{
9035
0
    if (*exceptionObject == NULL) {
9036
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9037
0
            unicode, startpos, endpos, reason);
9038
0
    }
9039
0
    else {
9040
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9041
0
            goto onError;
9042
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9043
0
            goto onError;
9044
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9045
0
            goto onError;
9046
0
        return;
9047
0
      onError:
9048
0
        Py_CLEAR(*exceptionObject);
9049
0
    }
9050
0
}
9051
9052
/* error handling callback helper:
9053
   build arguments, call the callback and check the arguments,
9054
   put the result into newpos and return the replacement string, which
9055
   has to be freed by the caller */
9056
static PyObject *
9057
unicode_translate_call_errorhandler(const char *errors,
9058
                                    PyObject **errorHandler,
9059
                                    const char *reason,
9060
                                    PyObject *unicode, PyObject **exceptionObject,
9061
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9062
                                    Py_ssize_t *newpos)
9063
0
{
9064
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9065
9066
0
    Py_ssize_t i_newpos;
9067
0
    PyObject *restuple;
9068
0
    PyObject *resunicode;
9069
9070
0
    if (*errorHandler == NULL) {
9071
0
        *errorHandler = PyCodec_LookupError(errors);
9072
0
        if (*errorHandler == NULL)
9073
0
            return NULL;
9074
0
    }
9075
9076
0
    make_translate_exception(exceptionObject,
9077
0
                             unicode, startpos, endpos, reason);
9078
0
    if (*exceptionObject == NULL)
9079
0
        return NULL;
9080
9081
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9082
0
    if (restuple == NULL)
9083
0
        return NULL;
9084
0
    if (!PyTuple_Check(restuple)) {
9085
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9086
0
        Py_DECREF(restuple);
9087
0
        return NULL;
9088
0
    }
9089
0
    if (!PyArg_ParseTuple(restuple, argparse,
9090
0
                          &resunicode, &i_newpos)) {
9091
0
        Py_DECREF(restuple);
9092
0
        return NULL;
9093
0
    }
9094
0
    if (i_newpos<0)
9095
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9096
0
    else
9097
0
        *newpos = i_newpos;
9098
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9099
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9100
0
        Py_DECREF(restuple);
9101
0
        return NULL;
9102
0
    }
9103
0
    Py_INCREF(resunicode);
9104
0
    Py_DECREF(restuple);
9105
0
    return resunicode;
9106
0
}
9107
9108
/* Lookup the character ch in the mapping and put the result in result,
9109
   which must be decrefed by the caller.
9110
   The result can be PyLong, PyUnicode, None or NULL.
9111
   If the result is PyLong, put its value in replace.
9112
   Return 0 on success, -1 on error */
9113
static int
9114
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9115
18.8k
{
9116
18.8k
    PyObject *w = PyLong_FromLong((long)c);
9117
18.8k
    PyObject *x;
9118
9119
18.8k
    if (w == NULL)
9120
0
        return -1;
9121
18.8k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9122
18.8k
    Py_DECREF(w);
9123
18.8k
    if (rc == 0) {
9124
        /* No mapping found means: use 1:1 mapping. */
9125
6.35k
        *result = NULL;
9126
6.35k
        return 0;
9127
6.35k
    }
9128
12.4k
    if (x == NULL) {
9129
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9130
            /* No mapping found means: use 1:1 mapping. */
9131
0
            PyErr_Clear();
9132
0
            *result = NULL;
9133
0
            return 0;
9134
0
        } else
9135
0
            return -1;
9136
0
    }
9137
12.4k
    else if (x == Py_None) {
9138
0
        *result = x;
9139
0
        return 0;
9140
0
    }
9141
12.4k
    else if (PyLong_Check(x)) {
9142
0
        long value = PyLong_AsLong(x);
9143
0
        if (value < 0 || value > MAX_UNICODE) {
9144
0
            PyErr_Format(PyExc_ValueError,
9145
0
                         "character mapping must be in range(0x%x)",
9146
0
                         MAX_UNICODE+1);
9147
0
            Py_DECREF(x);
9148
0
            return -1;
9149
0
        }
9150
0
        *result = x;
9151
0
        *replace = (Py_UCS4)value;
9152
0
        return 0;
9153
0
    }
9154
12.4k
    else if (PyUnicode_Check(x)) {
9155
12.4k
        *result = x;
9156
12.4k
        return 0;
9157
12.4k
    }
9158
0
    else {
9159
        /* wrong return value */
9160
0
        PyErr_SetString(PyExc_TypeError,
9161
0
                        "character mapping must return integer, None or str");
9162
0
        Py_DECREF(x);
9163
0
        return -1;
9164
0
    }
9165
12.4k
}
9166
9167
/* lookup the character, write the result into the writer.
9168
   Return 1 if the result was written into the writer, return 0 if the mapping
9169
   was undefined, raise an exception return -1 on error. */
9170
static int
9171
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9172
                        _PyUnicodeWriter *writer)
9173
6.40k
{
9174
6.40k
    PyObject *item;
9175
6.40k
    Py_UCS4 replace;
9176
9177
6.40k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9178
0
        return -1;
9179
9180
6.40k
    if (item == NULL) {
9181
        /* not found => default to 1:1 mapping */
9182
107
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9183
0
            return -1;
9184
0
        }
9185
107
        return 1;
9186
107
    }
9187
9188
6.29k
    if (item == Py_None) {
9189
0
        Py_DECREF(item);
9190
0
        return 0;
9191
0
    }
9192
9193
6.29k
    if (PyLong_Check(item)) {
9194
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9195
0
            Py_DECREF(item);
9196
0
            return -1;
9197
0
        }
9198
0
        Py_DECREF(item);
9199
0
        return 1;
9200
0
    }
9201
9202
6.29k
    if (!PyUnicode_Check(item)) {
9203
0
        Py_DECREF(item);
9204
0
        return -1;
9205
0
    }
9206
9207
6.29k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9208
0
        Py_DECREF(item);
9209
0
        return -1;
9210
0
    }
9211
9212
6.29k
    Py_DECREF(item);
9213
6.29k
    return 1;
9214
6.29k
}
9215
9216
static int
9217
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9218
                              Py_UCS1 *translate)
9219
12.4k
{
9220
12.4k
    PyObject *item = NULL;
9221
12.4k
    Py_UCS4 replace;
9222
12.4k
    int ret = 0;
9223
9224
12.4k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9225
0
        return -1;
9226
0
    }
9227
9228
12.4k
    if (item == Py_None) {
9229
        /* deletion */
9230
0
        translate[ch] = 0xfe;
9231
0
    }
9232
12.4k
    else if (item == NULL) {
9233
        /* not found => default to 1:1 mapping */
9234
6.25k
        translate[ch] = ch;
9235
6.25k
        return 1;
9236
6.25k
    }
9237
6.18k
    else if (PyLong_Check(item)) {
9238
0
        if (replace > 127) {
9239
            /* invalid character or character outside ASCII:
9240
               skip the fast translate */
9241
0
            goto exit;
9242
0
        }
9243
0
        translate[ch] = (Py_UCS1)replace;
9244
0
    }
9245
6.18k
    else if (PyUnicode_Check(item)) {
9246
6.18k
        if (PyUnicode_GET_LENGTH(item) != 1)
9247
6.18k
            goto exit;
9248
9249
0
        replace = PyUnicode_READ_CHAR(item, 0);
9250
0
        if (replace > 127)
9251
0
            goto exit;
9252
0
        translate[ch] = (Py_UCS1)replace;
9253
0
    }
9254
0
    else {
9255
        /* not None, NULL, long or unicode */
9256
0
        goto exit;
9257
0
    }
9258
0
    ret = 1;
9259
9260
6.18k
  exit:
9261
6.18k
    Py_DECREF(item);
9262
6.18k
    return ret;
9263
0
}
9264
9265
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9266
   was translated into writer, return 0 if the input string was partially
9267
   translated into writer, raise an exception and return -1 on error. */
9268
static int
9269
unicode_fast_translate(PyObject *input, PyObject *mapping,
9270
                       _PyUnicodeWriter *writer, int ignore,
9271
                       Py_ssize_t *input_pos)
9272
12.3k
{
9273
12.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9274
12.3k
    Py_ssize_t len;
9275
12.3k
    const Py_UCS1 *in, *end;
9276
12.3k
    Py_UCS1 *out;
9277
12.3k
    int res = 0;
9278
9279
12.3k
    len = PyUnicode_GET_LENGTH(input);
9280
9281
12.3k
    memset(ascii_table, 0xff, 128);
9282
9283
12.3k
    in = PyUnicode_1BYTE_DATA(input);
9284
12.3k
    end = in + len;
9285
9286
12.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9287
12.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9288
12.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9289
9290
18.6k
    for (; in < end; in++) {
9291
12.4k
        ch = *in;
9292
12.4k
        ch2 = ascii_table[ch];
9293
12.4k
        if (ch2 == 0xff) {
9294
12.4k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9295
12.4k
                                                          ascii_table);
9296
12.4k
            if (translate < 0)
9297
0
                return -1;
9298
12.4k
            if (translate == 0)
9299
6.18k
                goto exit;
9300
6.25k
            ch2 = ascii_table[ch];
9301
6.25k
        }
9302
6.29k
        if (ch2 == 0xfe) {
9303
0
            if (ignore)
9304
0
                continue;
9305
0
            goto exit;
9306
0
        }
9307
6.29k
        assert(ch2 < 128);
9308
6.29k
        *out = ch2;
9309
6.29k
        out++;
9310
6.29k
    }
9311
6.17k
    res = 1;
9312
9313
12.3k
exit:
9314
12.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9315
12.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9316
12.3k
    return res;
9317
6.17k
}
9318
9319
/* Translate the str object `input` through `mapping`.

   ASCII input first goes through unicode_fast_translate(); whatever it
   leaves over, and all non-ASCII input, is translated one character at
   a time.  Runs of characters that map to None are skipped when `errors`
   is "ignore", and otherwise handed to the error handler named by
   `errors`.
   Return the translated string, or NULL on error. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* error handler state */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    /* current input position */
    Py_ssize_t i = 0;

    /* Reserve room for a simple 1:1 translation; the writer grows when
       replacements need more. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    if (PyUnicode_IS_ASCII(input)) {
        int fast = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* otherwise continue below from the position stored in i */
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int written = charmaptranslate_output(ch, mapping, &writer);
        if (written < 0) {
            goto onError;
        }
        if (written != 0) {
            /* it worked => adjust input position */
            ++i;
            continue;
        }

        /* untranslatable character: extend the run over every following
           character that also maps to None */
        Py_ssize_t collstart = i;
        Py_ssize_t collend;
        for (collend = i + 1; collend < size; ++collend) {
            PyObject *item;
            Py_UCS4 unused;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &item, &unused)) {
                goto onError;
            }
            /* only the pointer value is compared after the release */
            Py_XDECREF(item);
            if (item != Py_None) {
                break;
            }
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler, reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (rc < 0) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9435
9436
/* Translate `str` through `mapping` using the error handler `errors`.
   Returns NULL if ensure_unicode() rejects `str`; otherwise returns the
   result of _PyUnicode_TranslateCharmap(). */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9445
9446
/* Build an ASCII copy of `unicode` in which every whitespace character
   becomes ' ' and every character with a decimal value becomes the
   matching ASCII digit.  At the first other character (outside the range
   copied verbatim) a '?' is written and the result is cut off right
   after it.

   An ASCII input is returned unchanged as a new reference.
   Return NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* nothing to transform */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);

    for (Py_ssize_t pos = 0; pos < len; ++pos) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* NOTE(review): 0x7F (DEL) is not covered by this test and falls
           through to the checks below - confirm this is intended. */
        if (ch < 127) {
            out[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[pos] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[pos] = '0' + decimal;
            continue;
        }

        /* neither space nor digit: mark it and truncate the result */
        out[pos] = '?';
        out[pos + 1] = '\0';
        _PyUnicode_LENGTH(result) = pos + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9491
9492
/* --- Helpers ------------------------------------------------------------ */
9493
9494
/* Helper macro to fix up start/end slice values against a length.

   `end` is clamped to `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues and are updated in place.  Every argument is
   parenthesized so that arbitrary expressions can be passed for `len`;
   note that the arguments are still evaluated more than once. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9513
9514
static Py_ssize_t
9515
any_find_slice(PyObject* s1, PyObject* s2,
9516
               Py_ssize_t start,
9517
               Py_ssize_t end,
9518
               int direction)
9519
21.8M
{
9520
21.8M
    int kind1, kind2;
9521
21.8M
    const void *buf1, *buf2;
9522
21.8M
    Py_ssize_t len1, len2, result;
9523
9524
21.8M
    kind1 = PyUnicode_KIND(s1);
9525
21.8M
    kind2 = PyUnicode_KIND(s2);
9526
21.8M
    if (kind1 < kind2)
9527
0
        return -1;
9528
9529
21.8M
    len1 = PyUnicode_GET_LENGTH(s1);
9530
21.8M
    len2 = PyUnicode_GET_LENGTH(s2);
9531
21.8M
    ADJUST_INDICES(start, end, len1);
9532
21.8M
    if (end - start < len2)
9533
1.33M
        return -1;
9534
9535
20.5M
    buf1 = PyUnicode_DATA(s1);
9536
20.5M
    buf2 = PyUnicode_DATA(s2);
9537
20.5M
    if (len2 == 1) {
9538
19.9M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9539
19.9M
        result = findchar((const char *)buf1 + kind1*start,
9540
19.9M
                          kind1, end - start, ch, direction);
9541
19.9M
        if (result == -1)
9542
3.64M
            return -1;
9543
16.2M
        else
9544
16.2M
            return start + result;
9545
19.9M
    }
9546
9547
624k
    if (kind2 != kind1) {
9548
263k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9549
263k
        if (!buf2)
9550
0
            return -2;
9551
263k
    }
9552
9553
624k
    if (direction > 0) {
9554
624k
        switch (kind1) {
9555
360k
        case PyUnicode_1BYTE_KIND:
9556
360k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9557
186k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9558
174k
            else
9559
174k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9560
360k
            break;
9561
189k
        case PyUnicode_2BYTE_KIND:
9562
189k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9563
189k
            break;
9564
73.7k
        case PyUnicode_4BYTE_KIND:
9565
73.7k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9566
73.7k
            break;
9567
0
        default:
9568
0
            Py_UNREACHABLE();
9569
624k
        }
9570
624k
    }
9571
0
    else {
9572
0
        switch (kind1) {
9573
0
        case PyUnicode_1BYTE_KIND:
9574
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9575
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9576
0
            else
9577
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9578
0
            break;
9579
0
        case PyUnicode_2BYTE_KIND:
9580
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9581
0
            break;
9582
0
        case PyUnicode_4BYTE_KIND:
9583
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9584
0
            break;
9585
0
        default:
9586
0
            Py_UNREACHABLE();
9587
0
        }
9588
0
    }
9589
9590
624k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9591
624k
    if (kind2 != kind1)
9592
263k
        PyMem_Free((void *)buf2);
9593
9594
624k
    return result;
9595
624k
}
9596
9597
9598
Py_ssize_t
9599
PyUnicode_Count(PyObject *str,
9600
                PyObject *substr,
9601
                Py_ssize_t start,
9602
                Py_ssize_t end)
9603
0
{
9604
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9605
0
        return -1;
9606
9607
0
    return unicode_count_impl(str, substr, start, end);
9608
0
}
9609
9610
Py_ssize_t
9611
PyUnicode_Find(PyObject *str,
9612
               PyObject *substr,
9613
               Py_ssize_t start,
9614
               Py_ssize_t end,
9615
               int direction)
9616
0
{
9617
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9618
0
        return -2;
9619
9620
0
    return any_find_slice(str, substr, start, end, direction);
9621
0
}
9622
9623
Py_ssize_t
9624
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9625
                   Py_ssize_t start, Py_ssize_t end,
9626
                   int direction)
9627
3.10M
{
9628
3.10M
    int kind;
9629
3.10M
    Py_ssize_t len, result;
9630
3.10M
    len = PyUnicode_GET_LENGTH(str);
9631
3.10M
    ADJUST_INDICES(start, end, len);
9632
3.10M
    if (end - start < 1)
9633
0
        return -1;
9634
3.10M
    kind = PyUnicode_KIND(str);
9635
3.10M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9636
3.10M
                      kind, end-start, ch, direction);
9637
3.10M
    if (result == -1)
9638
2.20M
        return -1;
9639
893k
    else
9640
893k
        return start + result;
9641
3.10M
}
9642
9643
static int
9644
tailmatch(PyObject *self,
9645
          PyObject *substring,
9646
          Py_ssize_t start,
9647
          Py_ssize_t end,
9648
          int direction)
9649
45.9M
{
9650
45.9M
    int kind_self;
9651
45.9M
    int kind_sub;
9652
45.9M
    const void *data_self;
9653
45.9M
    const void *data_sub;
9654
45.9M
    Py_ssize_t offset;
9655
45.9M
    Py_ssize_t i;
9656
45.9M
    Py_ssize_t end_sub;
9657
9658
45.9M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9659
45.9M
    end -= PyUnicode_GET_LENGTH(substring);
9660
45.9M
    if (end < start)
9661
6.96M
        return 0;
9662
9663
39.0M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9664
0
        return 1;
9665
9666
39.0M
    kind_self = PyUnicode_KIND(self);
9667
39.0M
    data_self = PyUnicode_DATA(self);
9668
39.0M
    kind_sub = PyUnicode_KIND(substring);
9669
39.0M
    data_sub = PyUnicode_DATA(substring);
9670
39.0M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9671
9672
39.0M
    if (direction > 0)
9673
7.01M
        offset = end;
9674
31.9M
    else
9675
31.9M
        offset = start;
9676
9677
39.0M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9678
39.0M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9679
25.4M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9680
25.4M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9681
        /* If both are of the same kind, memcmp is sufficient */
9682
11.0M
        if (kind_self == kind_sub) {
9683
5.78M
            return ! memcmp((char *)data_self +
9684
5.78M
                                (offset * PyUnicode_KIND(substring)),
9685
5.78M
                            data_sub,
9686
5.78M
                            PyUnicode_GET_LENGTH(substring) *
9687
5.78M
                                PyUnicode_KIND(substring));
9688
5.78M
        }
9689
        /* otherwise we have to compare each character by first accessing it */
9690
5.27M
        else {
9691
            /* We do not need to compare 0 and len(substring)-1 because
9692
               the if statement above ensured already that they are equal
9693
               when we end up here. */
9694
5.43M
            for (i = 1; i < end_sub; ++i) {
9695
163k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9696
163k
                    PyUnicode_READ(kind_sub, data_sub, i))
9697
4.94k
                    return 0;
9698
163k
            }
9699
5.26M
            return 1;
9700
5.27M
        }
9701
11.0M
    }
9702
9703
27.9M
    return 0;
9704
39.0M
}
9705
9706
Py_ssize_t
9707
PyUnicode_Tailmatch(PyObject *str,
9708
                    PyObject *substr,
9709
                    Py_ssize_t start,
9710
                    Py_ssize_t end,
9711
                    int direction)
9712
126
{
9713
126
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9714
0
        return -1;
9715
9716
126
    return tailmatch(str, substr, start, end, direction);
9717
126
}
9718
9719
/* Return a new ASCII string of the same length as `self`, filled by
   _Py_bytes_lower() when `lower` is non-zero and by _Py_bytes_upper()
   otherwise.  Returns NULL if the allocation fails. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    PyObject *res = PyUnicode_New(len, 127);
    if (res == NULL) {
        return NULL;
    }

    const char *src = PyUnicode_DATA(self);
    char *dst = PyUnicode_DATA(res);
    if (lower) {
        _Py_bytes_lower(dst, src, len);
    }
    else {
        _Py_bytes_upper(dst, src, len);
    }
    return res;
}
9737
9738
static Py_UCS4
9739
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9740
490k
{
9741
490k
    Py_ssize_t j;
9742
490k
    int final_sigma;
9743
490k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9744
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9745
9746
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9747
9748
    where ! is a negation and \p{xxx} is a character with property xxx.
9749
    */
9750
910k
    for (j = i - 1; j >= 0; j--) {
9751
906k
        c = PyUnicode_READ(kind, data, j);
9752
906k
        if (!_PyUnicode_IsCaseIgnorable(c))
9753
486k
            break;
9754
906k
    }
9755
490k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9756
490k
    if (final_sigma) {
9757
764k
        for (j = i + 1; j < length; j++) {
9758
762k
            c = PyUnicode_READ(kind, data, j);
9759
762k
            if (!_PyUnicode_IsCaseIgnorable(c))
9760
381k
                break;
9761
762k
        }
9762
383k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9763
383k
    }
9764
490k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9765
490k
}
9766
9767
static int
9768
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9769
           Py_UCS4 c, Py_UCS4 *mapped)
9770
123M
{
9771
    /* Obscure special case. */
9772
123M
    if (c == 0x3A3) {
9773
490k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9774
490k
        return 1;
9775
490k
    }
9776
123M
    return _PyUnicode_ToLowerFull(c, mapped);
9777
123M
}
9778
9779
static Py_ssize_t
9780
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9781
0
{
9782
0
    Py_ssize_t i, k = 0;
9783
0
    int n_res, j;
9784
0
    Py_UCS4 c, mapped[3];
9785
9786
0
    c = PyUnicode_READ(kind, data, 0);
9787
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9788
0
    for (j = 0; j < n_res; j++) {
9789
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9790
0
        res[k++] = mapped[j];
9791
0
    }
9792
0
    for (i = 1; i < length; i++) {
9793
0
        c = PyUnicode_READ(kind, data, i);
9794
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9795
0
        for (j = 0; j < n_res; j++) {
9796
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9797
0
            res[k++] = mapped[j];
9798
0
        }
9799
0
    }
9800
0
    return k;
9801
0
}
9802
9803
static Py_ssize_t
9804
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9805
0
    Py_ssize_t i, k = 0;
9806
9807
0
    for (i = 0; i < length; i++) {
9808
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9809
0
        int n_res, j;
9810
0
        if (Py_UNICODE_ISUPPER(c)) {
9811
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9812
0
        }
9813
0
        else if (Py_UNICODE_ISLOWER(c)) {
9814
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9815
0
        }
9816
0
        else {
9817
0
            n_res = 1;
9818
0
            mapped[0] = c;
9819
0
        }
9820
0
        for (j = 0; j < n_res; j++) {
9821
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9822
0
            res[k++] = mapped[j];
9823
0
        }
9824
0
    }
9825
0
    return k;
9826
0
}
9827
9828
static Py_ssize_t
9829
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9830
                  Py_UCS4 *maxchar, int lower)
9831
3.09M
{
9832
3.09M
    Py_ssize_t i, k = 0;
9833
9834
126M
    for (i = 0; i < length; i++) {
9835
123M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9836
123M
        int n_res, j;
9837
123M
        if (lower)
9838
123M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9839
0
        else
9840
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9841
246M
        for (j = 0; j < n_res; j++) {
9842
123M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9843
123M
            res[k++] = mapped[j];
9844
123M
        }
9845
123M
    }
9846
3.09M
    return k;
9847
3.09M
}
9848
9849
static Py_ssize_t
9850
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9851
0
{
9852
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9853
0
}
9854
9855
static Py_ssize_t
9856
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9857
3.09M
{
9858
3.09M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9859
3.09M
}
9860
9861
static Py_ssize_t
9862
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9863
0
{
9864
0
    Py_ssize_t i, k = 0;
9865
9866
0
    for (i = 0; i < length; i++) {
9867
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9868
0
        Py_UCS4 mapped[3];
9869
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9870
0
        for (j = 0; j < n_res; j++) {
9871
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9872
0
            res[k++] = mapped[j];
9873
0
        }
9874
0
    }
9875
0
    return k;
9876
0
}
9877
9878
static Py_ssize_t
9879
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9880
0
{
9881
0
    Py_ssize_t i, k = 0;
9882
0
    int previous_is_cased;
9883
9884
0
    previous_is_cased = 0;
9885
0
    for (i = 0; i < length; i++) {
9886
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9887
0
        Py_UCS4 mapped[3];
9888
0
        int n_res, j;
9889
9890
0
        if (previous_is_cased)
9891
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9892
0
        else
9893
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9894
9895
0
        for (j = 0; j < n_res; j++) {
9896
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9897
0
            res[k++] = mapped[j];
9898
0
        }
9899
9900
0
        previous_is_cased = _PyUnicode_IsCased(c);
9901
0
    }
9902
0
    return k;
9903
0
}
9904
9905
static PyObject *
9906
case_operation(PyObject *self,
9907
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9908
3.09M
{
9909
3.09M
    PyObject *res = NULL;
9910
3.09M
    Py_ssize_t length, newlength = 0;
9911
3.09M
    int kind, outkind;
9912
3.09M
    const void *data;
9913
3.09M
    void *outdata;
9914
3.09M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9915
9916
3.09M
    kind = PyUnicode_KIND(self);
9917
3.09M
    data = PyUnicode_DATA(self);
9918
3.09M
    length = PyUnicode_GET_LENGTH(self);
9919
3.09M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9920
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9921
0
        return NULL;
9922
0
    }
9923
3.09M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9924
3.09M
    if (tmp == NULL)
9925
0
        return PyErr_NoMemory();
9926
3.09M
    newlength = perform(kind, data, length, tmp, &maxchar);
9927
3.09M
    res = PyUnicode_New(newlength, maxchar);
9928
3.09M
    if (res == NULL)
9929
0
        goto leave;
9930
3.09M
    tmpend = tmp + newlength;
9931
3.09M
    outdata = PyUnicode_DATA(res);
9932
3.09M
    outkind = PyUnicode_KIND(res);
9933
3.09M
    switch (outkind) {
9934
189k
    case PyUnicode_1BYTE_KIND:
9935
189k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9936
189k
        break;
9937
2.84M
    case PyUnicode_2BYTE_KIND:
9938
2.84M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9939
2.84M
        break;
9940
58.5k
    case PyUnicode_4BYTE_KIND:
9941
58.5k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9942
58.5k
        break;
9943
0
    default:
9944
0
        Py_UNREACHABLE();
9945
3.09M
    }
9946
3.09M
  leave:
9947
3.09M
    PyMem_Free(tmp);
9948
3.09M
    return res;
9949
3.09M
}
9950
9951
/* Join the items of the iterable `seq` with `separator`.

   The iterable is first turned into a fast sequence; the items are then
   joined by _PyUnicode_JoinArray() inside the critical section opened on
   `seq`.  Returns a new reference, or NULL with an exception set. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");
    if (fast == NULL) {
        return NULL;
    }

    /* Declared outside the critical section so that it is still in scope
       after the closing macro. */
    PyObject *joined;

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9975
9976
PyObject *
9977
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9978
37.0M
{
9979
37.0M
    PyObject *res = NULL; /* the result */
9980
37.0M
    PyObject *sep = NULL;
9981
37.0M
    Py_ssize_t seplen;
9982
37.0M
    PyObject *item;
9983
37.0M
    Py_ssize_t sz, i, res_offset;
9984
37.0M
    Py_UCS4 maxchar;
9985
37.0M
    Py_UCS4 item_maxchar;
9986
37.0M
    int use_memcpy;
9987
37.0M
    unsigned char *res_data = NULL, *sep_data = NULL;
9988
37.0M
    PyObject *last_obj;
9989
37.0M
    int kind = 0;
9990
9991
    /* If empty sequence, return u"". */
9992
37.0M
    if (seqlen == 0) {
9993
7.07M
        _Py_RETURN_UNICODE_EMPTY();
9994
7.07M
    }
9995
9996
    /* If singleton sequence with an exact Unicode, return that. */
9997
29.9M
    last_obj = NULL;
9998
29.9M
    if (seqlen == 1) {
9999
12.7M
        if (PyUnicode_CheckExact(items[0])) {
10000
11.4M
            res = items[0];
10001
11.4M
            return Py_NewRef(res);
10002
11.4M
        }
10003
1.36M
        seplen = 0;
10004
1.36M
        maxchar = 0;
10005
1.36M
    }
10006
17.2M
    else {
10007
        /* Set up sep and seplen */
10008
17.2M
        if (separator == NULL) {
10009
            /* fall back to a blank space separator */
10010
0
            sep = PyUnicode_FromOrdinal(' ');
10011
0
            if (!sep)
10012
0
                goto onError;
10013
0
            seplen = 1;
10014
0
            maxchar = 32;
10015
0
        }
10016
17.2M
        else {
10017
17.2M
            if (!PyUnicode_Check(separator)) {
10018
0
                PyErr_Format(PyExc_TypeError,
10019
0
                             "separator: expected str instance,"
10020
0
                             " %.80s found",
10021
0
                             Py_TYPE(separator)->tp_name);
10022
0
                goto onError;
10023
0
            }
10024
17.2M
            sep = separator;
10025
17.2M
            seplen = PyUnicode_GET_LENGTH(separator);
10026
17.2M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10027
            /* inc refcount to keep this code path symmetric with the
10028
               above case of a blank separator */
10029
17.2M
            Py_INCREF(sep);
10030
17.2M
        }
10031
17.2M
        last_obj = sep;
10032
17.2M
    }
10033
10034
    /* There are at least two things to join, or else we have a subclass
10035
     * of str in the sequence.
10036
     * Do a pre-pass to figure out the total amount of space we'll
10037
     * need (sz), and see whether all argument are strings.
10038
     */
10039
18.5M
    sz = 0;
10040
#ifdef Py_DEBUG
10041
    use_memcpy = 0;
10042
#else
10043
18.5M
    use_memcpy = 1;
10044
18.5M
#endif
10045
181M
    for (i = 0; i < seqlen; i++) {
10046
162M
        size_t add_sz;
10047
162M
        item = items[i];
10048
162M
        if (!PyUnicode_Check(item)) {
10049
0
            PyErr_Format(PyExc_TypeError,
10050
0
                         "sequence item %zd: expected str instance,"
10051
0
                         " %.80s found",
10052
0
                         i, Py_TYPE(item)->tp_name);
10053
0
            goto onError;
10054
0
        }
10055
162M
        add_sz = PyUnicode_GET_LENGTH(item);
10056
162M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10057
162M
        maxchar = Py_MAX(maxchar, item_maxchar);
10058
162M
        if (i != 0) {
10059
143M
            add_sz += seplen;
10060
143M
        }
10061
162M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10062
0
            PyErr_SetString(PyExc_OverflowError,
10063
0
                            "join() result is too long for a Python string");
10064
0
            goto onError;
10065
0
        }
10066
162M
        sz += add_sz;
10067
162M
        if (use_memcpy && last_obj != NULL) {
10068
98.4M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10069
2.89M
                use_memcpy = 0;
10070
98.4M
        }
10071
162M
        last_obj = item;
10072
162M
    }
10073
10074
18.5M
    res = PyUnicode_New(sz, maxchar);
10075
18.5M
    if (res == NULL)
10076
0
        goto onError;
10077
10078
    /* Catenate everything. */
10079
#ifdef Py_DEBUG
10080
    use_memcpy = 0;
10081
#else
10082
18.5M
    if (use_memcpy) {
10083
15.6M
        res_data = PyUnicode_1BYTE_DATA(res);
10084
15.6M
        kind = PyUnicode_KIND(res);
10085
15.6M
        if (seplen != 0)
10086
178k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10087
15.6M
    }
10088
18.5M
#endif
10089
18.5M
    if (use_memcpy) {
10090
98.3M
        for (i = 0; i < seqlen; ++i) {
10091
82.6M
            Py_ssize_t itemlen;
10092
82.6M
            item = items[i];
10093
10094
            /* Copy item, and maybe the separator. */
10095
82.6M
            if (i && seplen != 0) {
10096
736k
                memcpy(res_data,
10097
736k
                          sep_data,
10098
736k
                          kind * seplen);
10099
736k
                res_data += kind * seplen;
10100
736k
            }
10101
10102
82.6M
            itemlen = PyUnicode_GET_LENGTH(item);
10103
82.6M
            if (itemlen != 0) {
10104
73.2M
                memcpy(res_data,
10105
73.2M
                          PyUnicode_DATA(item),
10106
73.2M
                          kind * itemlen);
10107
73.2M
                res_data += kind * itemlen;
10108
73.2M
            }
10109
82.6M
        }
10110
15.6M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10111
15.6M
                           + kind * PyUnicode_GET_LENGTH(res));
10112
15.6M
    }
10113
2.89M
    else {
10114
82.6M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10115
79.7M
            Py_ssize_t itemlen;
10116
79.7M
            item = items[i];
10117
10118
            /* Copy item, and maybe the separator. */
10119
79.7M
            if (i && seplen != 0) {
10120
996k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10121
996k
                res_offset += seplen;
10122
996k
            }
10123
10124
79.7M
            itemlen = PyUnicode_GET_LENGTH(item);
10125
79.7M
            if (itemlen != 0) {
10126
78.4M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10127
78.4M
                res_offset += itemlen;
10128
78.4M
            }
10129
79.7M
        }
10130
2.89M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10131
2.89M
    }
10132
10133
18.5M
    Py_XDECREF(sep);
10134
18.5M
    assert(_PyUnicode_CheckConsistency(res, 1));
10135
18.5M
    return res;
10136
10137
0
  onError:
10138
0
    Py_XDECREF(sep);
10139
0
    Py_XDECREF(res);
10140
0
    return NULL;
10141
18.5M
}
10142
10143
void
10144
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10145
                    Py_UCS4 fill_char)
10146
612
{
10147
612
    const int kind = PyUnicode_KIND(unicode);
10148
612
    void *data = PyUnicode_DATA(unicode);
10149
612
    assert(_PyUnicode_IsModifiable(unicode));
10150
612
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10151
612
    assert(start >= 0);
10152
612
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10153
612
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10154
612
}
10155
10156
Py_ssize_t
10157
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10158
               Py_UCS4 fill_char)
10159
612
{
10160
612
    Py_ssize_t maxlen;
10161
10162
612
    if (!PyUnicode_Check(unicode)) {
10163
0
        PyErr_BadInternalCall();
10164
0
        return -1;
10165
0
    }
10166
612
    if (unicode_check_modifiable(unicode))
10167
0
        return -1;
10168
10169
612
    if (start < 0) {
10170
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10171
0
        return -1;
10172
0
    }
10173
612
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10174
0
        PyErr_SetString(PyExc_ValueError,
10175
0
                         "fill character is bigger than "
10176
0
                         "the string maximum character");
10177
0
        return -1;
10178
0
    }
10179
10180
612
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10181
612
    length = Py_MIN(maxlen, length);
10182
612
    if (length <= 0)
10183
0
        return 0;
10184
10185
612
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10186
612
    return length;
10187
612
}
10188
10189
static PyObject *
10190
pad(PyObject *self,
10191
    Py_ssize_t left,
10192
    Py_ssize_t right,
10193
    Py_UCS4 fill)
10194
68
{
10195
68
    PyObject *u;
10196
68
    Py_UCS4 maxchar;
10197
68
    int kind;
10198
68
    void *data;
10199
10200
68
    if (left < 0)
10201
0
        left = 0;
10202
68
    if (right < 0)
10203
0
        right = 0;
10204
10205
68
    if (left == 0 && right == 0)
10206
0
        return unicode_result_unchanged(self);
10207
10208
68
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10209
68
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10210
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10211
0
        return NULL;
10212
0
    }
10213
68
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10214
68
    maxchar = Py_MAX(maxchar, fill);
10215
68
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10216
68
    if (!u)
10217
0
        return NULL;
10218
10219
68
    kind = PyUnicode_KIND(u);
10220
68
    data = PyUnicode_DATA(u);
10221
68
    if (left)
10222
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10223
68
    if (right)
10224
68
        _PyUnicode_Fill(kind, data, fill,
10225
68
                        left + _PyUnicode_LENGTH(self), right);
10226
68
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10227
68
    assert(_PyUnicode_CheckConsistency(u, 1));
10228
68
    return u;
10229
68
}
10230
10231
/* Split `string` into lines by dispatching to the stringlib splitlines
   implementation that matches its storage kind (with a separate variant for
   ASCII strings).  `keepends` is passed through unchanged.  Returns NULL if
   ensure_unicode() rejects `string`; otherwise whatever the selected
   implementation returns. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), len, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), len, keepends);
    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), len, keepends);
    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), len, keepends);
    default:
        Py_UNREACHABLE();
    }
}
10265
10266
static PyObject *
10267
split(PyObject *self,
10268
      PyObject *substring,
10269
      Py_ssize_t maxcount)
10270
20.4M
{
10271
20.4M
    int kind1, kind2;
10272
20.4M
    const void *buf1, *buf2;
10273
20.4M
    Py_ssize_t len1, len2;
10274
20.4M
    PyObject* out;
10275
20.4M
    len1 = PyUnicode_GET_LENGTH(self);
10276
20.4M
    kind1 = PyUnicode_KIND(self);
10277
10278
20.4M
    if (substring == NULL) {
10279
173k
        if (maxcount < 0) {
10280
148k
            maxcount = (len1 - 1) / 2 + 1;
10281
148k
        }
10282
173k
        switch (kind1) {
10283
112k
        case PyUnicode_1BYTE_KIND:
10284
112k
            if (PyUnicode_IS_ASCII(self))
10285
82.5k
                return asciilib_split_whitespace(
10286
82.5k
                    self,  PyUnicode_1BYTE_DATA(self),
10287
82.5k
                    len1, maxcount
10288
82.5k
                    );
10289
29.9k
            else
10290
29.9k
                return ucs1lib_split_whitespace(
10291
29.9k
                    self,  PyUnicode_1BYTE_DATA(self),
10292
29.9k
                    len1, maxcount
10293
29.9k
                    );
10294
49.2k
        case PyUnicode_2BYTE_KIND:
10295
49.2k
            return ucs2lib_split_whitespace(
10296
49.2k
                self,  PyUnicode_2BYTE_DATA(self),
10297
49.2k
                len1, maxcount
10298
49.2k
                );
10299
11.8k
        case PyUnicode_4BYTE_KIND:
10300
11.8k
            return ucs4lib_split_whitespace(
10301
11.8k
                self,  PyUnicode_4BYTE_DATA(self),
10302
11.8k
                len1, maxcount
10303
11.8k
                );
10304
0
        default:
10305
0
            Py_UNREACHABLE();
10306
173k
        }
10307
173k
    }
10308
10309
20.2M
    kind2 = PyUnicode_KIND(substring);
10310
20.2M
    len2 = PyUnicode_GET_LENGTH(substring);
10311
20.2M
    if (maxcount < 0) {
10312
        // if len2 == 0, it will raise ValueError.
10313
14.9M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10314
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10315
14.9M
        maxcount = maxcount < 0 ? len1 : maxcount;
10316
14.9M
    }
10317
20.2M
    if (kind1 < kind2 || len1 < len2) {
10318
1.04M
        out = PyList_New(1);
10319
1.04M
        if (out == NULL)
10320
0
            return NULL;
10321
1.04M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10322
1.04M
        return out;
10323
1.04M
    }
10324
19.2M
    buf1 = PyUnicode_DATA(self);
10325
19.2M
    buf2 = PyUnicode_DATA(substring);
10326
19.2M
    if (kind2 != kind1) {
10327
231k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10328
231k
        if (!buf2)
10329
0
            return NULL;
10330
231k
    }
10331
10332
19.2M
    switch (kind1) {
10333
19.0M
    case PyUnicode_1BYTE_KIND:
10334
19.0M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10335
17.8M
            out = asciilib_split(
10336
17.8M
                self,  buf1, len1, buf2, len2, maxcount);
10337
1.12M
        else
10338
1.12M
            out = ucs1lib_split(
10339
1.12M
                self,  buf1, len1, buf2, len2, maxcount);
10340
19.0M
        break;
10341
195k
    case PyUnicode_2BYTE_KIND:
10342
195k
        out = ucs2lib_split(
10343
195k
            self,  buf1, len1, buf2, len2, maxcount);
10344
195k
        break;
10345
36.5k
    case PyUnicode_4BYTE_KIND:
10346
36.5k
        out = ucs4lib_split(
10347
36.5k
            self,  buf1, len1, buf2, len2, maxcount);
10348
36.5k
        break;
10349
0
    default:
10350
0
        out = NULL;
10351
19.2M
    }
10352
19.2M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10353
19.2M
    if (kind2 != kind1)
10354
231k
        PyMem_Free((void *)buf2);
10355
19.2M
    return out;
10356
19.2M
}
10357
10358
static PyObject *
10359
rsplit(PyObject *self,
10360
       PyObject *substring,
10361
       Py_ssize_t maxcount)
10362
78
{
10363
78
    int kind1, kind2;
10364
78
    const void *buf1, *buf2;
10365
78
    Py_ssize_t len1, len2;
10366
78
    PyObject* out;
10367
10368
78
    len1 = PyUnicode_GET_LENGTH(self);
10369
78
    kind1 = PyUnicode_KIND(self);
10370
10371
78
    if (substring == NULL) {
10372
0
        if (maxcount < 0) {
10373
0
            maxcount = (len1 - 1) / 2 + 1;
10374
0
        }
10375
0
        switch (kind1) {
10376
0
        case PyUnicode_1BYTE_KIND:
10377
0
            if (PyUnicode_IS_ASCII(self))
10378
0
                return asciilib_rsplit_whitespace(
10379
0
                    self,  PyUnicode_1BYTE_DATA(self),
10380
0
                    len1, maxcount
10381
0
                    );
10382
0
            else
10383
0
                return ucs1lib_rsplit_whitespace(
10384
0
                    self,  PyUnicode_1BYTE_DATA(self),
10385
0
                    len1, maxcount
10386
0
                    );
10387
0
        case PyUnicode_2BYTE_KIND:
10388
0
            return ucs2lib_rsplit_whitespace(
10389
0
                self,  PyUnicode_2BYTE_DATA(self),
10390
0
                len1, maxcount
10391
0
                );
10392
0
        case PyUnicode_4BYTE_KIND:
10393
0
            return ucs4lib_rsplit_whitespace(
10394
0
                self,  PyUnicode_4BYTE_DATA(self),
10395
0
                len1, maxcount
10396
0
                );
10397
0
        default:
10398
0
            Py_UNREACHABLE();
10399
0
        }
10400
0
    }
10401
78
    kind2 = PyUnicode_KIND(substring);
10402
78
    len2 = PyUnicode_GET_LENGTH(substring);
10403
78
    if (maxcount < 0) {
10404
        // if len2 == 0, it will raise ValueError.
10405
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10406
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10407
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10408
0
    }
10409
78
    if (kind1 < kind2 || len1 < len2) {
10410
0
        out = PyList_New(1);
10411
0
        if (out == NULL)
10412
0
            return NULL;
10413
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10414
0
        return out;
10415
0
    }
10416
78
    buf1 = PyUnicode_DATA(self);
10417
78
    buf2 = PyUnicode_DATA(substring);
10418
78
    if (kind2 != kind1) {
10419
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10420
0
        if (!buf2)
10421
0
            return NULL;
10422
0
    }
10423
10424
78
    switch (kind1) {
10425
78
    case PyUnicode_1BYTE_KIND:
10426
78
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10427
78
            out = asciilib_rsplit(
10428
78
                self,  buf1, len1, buf2, len2, maxcount);
10429
0
        else
10430
0
            out = ucs1lib_rsplit(
10431
0
                self,  buf1, len1, buf2, len2, maxcount);
10432
78
        break;
10433
0
    case PyUnicode_2BYTE_KIND:
10434
0
        out = ucs2lib_rsplit(
10435
0
            self,  buf1, len1, buf2, len2, maxcount);
10436
0
        break;
10437
0
    case PyUnicode_4BYTE_KIND:
10438
0
        out = ucs4lib_rsplit(
10439
0
            self,  buf1, len1, buf2, len2, maxcount);
10440
0
        break;
10441
0
    default:
10442
0
        out = NULL;
10443
78
    }
10444
78
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10445
78
    if (kind2 != kind1)
10446
0
        PyMem_Free((void *)buf2);
10447
78
    return out;
10448
78
}
10449
10450
static Py_ssize_t
10451
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10452
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10453
21.6M
{
10454
21.6M
    switch (kind) {
10455
7.67M
    case PyUnicode_1BYTE_KIND:
10456
7.67M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10457
4.00M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10458
3.66M
        else
10459
3.66M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10460
6.39M
    case PyUnicode_2BYTE_KIND:
10461
6.39M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10462
7.56M
    case PyUnicode_4BYTE_KIND:
10463
7.56M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10464
21.6M
    }
10465
21.6M
    Py_UNREACHABLE();
10466
21.6M
}
10467
10468
static Py_ssize_t
10469
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10470
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10471
9.50M
{
10472
9.50M
    switch (kind) {
10473
8.72M
    case PyUnicode_1BYTE_KIND:
10474
8.72M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10475
704k
    case PyUnicode_2BYTE_KIND:
10476
704k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10477
82.6k
    case PyUnicode_4BYTE_KIND:
10478
82.6k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10479
9.50M
    }
10480
9.50M
    Py_UNREACHABLE();
10481
9.50M
}
10482
10483
static void
10484
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10485
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10486
81.3k
{
10487
81.3k
    int kind = PyUnicode_KIND(u);
10488
81.3k
    void *data = PyUnicode_DATA(u);
10489
81.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10490
81.3k
    if (kind == PyUnicode_1BYTE_KIND) {
10491
47.8k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10492
47.8k
                                      (Py_UCS1 *)data + len,
10493
47.8k
                                      u1, u2, maxcount);
10494
47.8k
    }
10495
33.4k
    else if (kind == PyUnicode_2BYTE_KIND) {
10496
25.3k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10497
25.3k
                                      (Py_UCS2 *)data + len,
10498
25.3k
                                      u1, u2, maxcount);
10499
25.3k
    }
10500
8.05k
    else {
10501
8.05k
        assert(kind == PyUnicode_4BYTE_KIND);
10502
8.05k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10503
8.05k
                                      (Py_UCS4 *)data + len,
10504
8.05k
                                      u1, u2, maxcount);
10505
8.05k
    }
10506
81.3k
}
10507
10508
static PyObject *
10509
replace(PyObject *self, PyObject *str1,
10510
        PyObject *str2, Py_ssize_t maxcount)
10511
16.3M
{
10512
16.3M
    PyObject *u;
10513
16.3M
    const char *sbuf = PyUnicode_DATA(self);
10514
16.3M
    const void *buf1 = PyUnicode_DATA(str1);
10515
16.3M
    const void *buf2 = PyUnicode_DATA(str2);
10516
16.3M
    int srelease = 0, release1 = 0, release2 = 0;
10517
16.3M
    int skind = PyUnicode_KIND(self);
10518
16.3M
    int kind1 = PyUnicode_KIND(str1);
10519
16.3M
    int kind2 = PyUnicode_KIND(str2);
10520
16.3M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10521
16.3M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10522
16.3M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10523
16.3M
    int mayshrink;
10524
16.3M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10525
10526
16.3M
    if (slen < len1)
10527
6.39M
        goto nothing;
10528
10529
9.93M
    if (maxcount < 0)
10530
9.93M
        maxcount = PY_SSIZE_T_MAX;
10531
0
    else if (maxcount == 0)
10532
0
        goto nothing;
10533
10534
9.93M
    if (str1 == str2)
10535
26.8k
        goto nothing;
10536
10537
9.91M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10538
9.91M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10539
9.91M
    if (maxchar < maxchar_str1)
10540
        /* substring too wide to be present */
10541
0
        goto nothing;
10542
9.91M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10543
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10544
       result string. */
10545
9.91M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10546
9.91M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10547
10548
9.91M
    if (len1 == len2) {
10549
        /* same length */
10550
401k
        if (len1 == 0)
10551
0
            goto nothing;
10552
401k
        if (len1 == 1) {
10553
            /* replace characters */
10554
394k
            Py_UCS4 u1, u2;
10555
394k
            Py_ssize_t pos;
10556
10557
394k
            u1 = PyUnicode_READ(kind1, buf1, 0);
10558
394k
            pos = findchar(sbuf, skind, slen, u1, 1);
10559
394k
            if (pos < 0)
10560
313k
                goto nothing;
10561
81.3k
            u2 = PyUnicode_READ(kind2, buf2, 0);
10562
81.3k
            u = PyUnicode_New(slen, maxchar);
10563
81.3k
            if (!u)
10564
0
                goto error;
10565
10566
81.3k
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10567
81.3k
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10568
81.3k
        }
10569
7.07k
        else {
10570
7.07k
            int rkind = skind;
10571
7.07k
            char *res;
10572
7.07k
            Py_ssize_t i;
10573
10574
7.07k
            if (kind1 < rkind) {
10575
                /* widen substring */
10576
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10577
0
                if (!buf1) goto error;
10578
0
                release1 = 1;
10579
0
            }
10580
7.07k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10581
7.07k
            if (i < 0)
10582
7.07k
                goto nothing;
10583
0
            if (rkind > kind2) {
10584
                /* widen replacement */
10585
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10586
0
                if (!buf2) goto error;
10587
0
                release2 = 1;
10588
0
            }
10589
0
            else if (rkind < kind2) {
10590
                /* widen self and buf1 */
10591
0
                rkind = kind2;
10592
0
                if (release1) {
10593
0
                    assert(buf1 != PyUnicode_DATA(str1));
10594
0
                    PyMem_Free((void *)buf1);
10595
0
                    buf1 = PyUnicode_DATA(str1);
10596
0
                    release1 = 0;
10597
0
                }
10598
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10599
0
                if (!sbuf) goto error;
10600
0
                srelease = 1;
10601
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10602
0
                if (!buf1) goto error;
10603
0
                release1 = 1;
10604
0
            }
10605
0
            u = PyUnicode_New(slen, maxchar);
10606
0
            if (!u)
10607
0
                goto error;
10608
0
            assert(PyUnicode_KIND(u) == rkind);
10609
0
            res = PyUnicode_DATA(u);
10610
10611
0
            memcpy(res, sbuf, rkind * slen);
10612
            /* change everything in-place, starting with this one */
10613
0
            memcpy(res + rkind * i,
10614
0
                   buf2,
10615
0
                   rkind * len2);
10616
0
            i += len1;
10617
10618
0
            while ( --maxcount > 0) {
10619
0
                i = anylib_find(rkind, self,
10620
0
                                sbuf+rkind*i, slen-i,
10621
0
                                str1, buf1, len1, i);
10622
0
                if (i == -1)
10623
0
                    break;
10624
0
                memcpy(res + rkind * i,
10625
0
                       buf2,
10626
0
                       rkind * len2);
10627
0
                i += len1;
10628
0
            }
10629
0
        }
10630
401k
    }
10631
9.50M
    else {
10632
9.50M
        Py_ssize_t n, i, j, ires;
10633
9.50M
        Py_ssize_t new_size;
10634
9.50M
        int rkind = skind;
10635
9.50M
        char *res;
10636
10637
9.50M
        if (kind1 < rkind) {
10638
            /* widen substring */
10639
787k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10640
787k
            if (!buf1) goto error;
10641
787k
            release1 = 1;
10642
787k
        }
10643
9.50M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10644
9.50M
        if (n == 0)
10645
8.36M
            goto nothing;
10646
1.14M
        if (kind2 < rkind) {
10647
            /* widen replacement */
10648
45.9k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10649
45.9k
            if (!buf2) goto error;
10650
45.9k
            release2 = 1;
10651
45.9k
        }
10652
1.09M
        else if (kind2 > rkind) {
10653
            /* widen self and buf1 */
10654
0
            rkind = kind2;
10655
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10656
0
            if (!sbuf) goto error;
10657
0
            srelease = 1;
10658
0
            if (release1) {
10659
0
                assert(buf1 != PyUnicode_DATA(str1));
10660
0
                PyMem_Free((void *)buf1);
10661
0
                buf1 = PyUnicode_DATA(str1);
10662
0
                release1 = 0;
10663
0
            }
10664
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10665
0
            if (!buf1) goto error;
10666
0
            release1 = 1;
10667
0
        }
10668
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10669
           PyUnicode_GET_LENGTH(str1)); */
10670
1.14M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10671
0
                PyErr_SetString(PyExc_OverflowError,
10672
0
                                "replace string is too long");
10673
0
                goto error;
10674
0
        }
10675
1.14M
        new_size = slen + n * (len2 - len1);
10676
1.14M
        if (new_size == 0) {
10677
0
            u = _PyUnicode_GetEmpty();
10678
0
            goto done;
10679
0
        }
10680
1.14M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10681
0
            PyErr_SetString(PyExc_OverflowError,
10682
0
                            "replace string is too long");
10683
0
            goto error;
10684
0
        }
10685
1.14M
        u = PyUnicode_New(new_size, maxchar);
10686
1.14M
        if (!u)
10687
0
            goto error;
10688
1.14M
        assert(PyUnicode_KIND(u) == rkind);
10689
1.14M
        res = PyUnicode_DATA(u);
10690
1.14M
        ires = i = 0;
10691
1.14M
        if (len1 > 0) {
10692
22.7M
            while (n-- > 0) {
10693
                /* look for next match */
10694
21.6M
                j = anylib_find(rkind, self,
10695
21.6M
                                sbuf + rkind * i, slen-i,
10696
21.6M
                                str1, buf1, len1, i);
10697
21.6M
                if (j == -1)
10698
0
                    break;
10699
21.6M
                else if (j > i) {
10700
                    /* copy unchanged part [i:j] */
10701
3.97M
                    memcpy(res + rkind * ires,
10702
3.97M
                           sbuf + rkind * i,
10703
3.97M
                           rkind * (j-i));
10704
3.97M
                    ires += j - i;
10705
3.97M
                }
10706
                /* copy substitution string */
10707
21.6M
                if (len2 > 0) {
10708
21.6M
                    memcpy(res + rkind * ires,
10709
21.6M
                           buf2,
10710
21.6M
                           rkind * len2);
10711
21.6M
                    ires += len2;
10712
21.6M
                }
10713
21.6M
                i = j + len1;
10714
21.6M
            }
10715
1.14M
            if (i < slen)
10716
                /* copy tail [i:] */
10717
1.12M
                memcpy(res + rkind * ires,
10718
1.12M
                       sbuf + rkind * i,
10719
1.12M
                       rkind * (slen-i));
10720
1.14M
        }
10721
0
        else {
10722
            /* interleave */
10723
0
            while (n > 0) {
10724
0
                memcpy(res + rkind * ires,
10725
0
                       buf2,
10726
0
                       rkind * len2);
10727
0
                ires += len2;
10728
0
                if (--n <= 0)
10729
0
                    break;
10730
0
                memcpy(res + rkind * ires,
10731
0
                       sbuf + rkind * i,
10732
0
                       rkind);
10733
0
                ires++;
10734
0
                i++;
10735
0
            }
10736
0
            memcpy(res + rkind * ires,
10737
0
                   sbuf + rkind * i,
10738
0
                   rkind * (slen-i));
10739
0
        }
10740
1.14M
    }
10741
10742
1.22M
    if (mayshrink) {
10743
0
        unicode_adjust_maxchar(&u);
10744
0
        if (u == NULL)
10745
0
            goto error;
10746
0
    }
10747
10748
1.22M
  done:
10749
1.22M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10750
1.22M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10751
1.22M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10752
1.22M
    if (srelease)
10753
0
        PyMem_Free((void *)sbuf);
10754
1.22M
    if (release1)
10755
45.9k
        PyMem_Free((void *)buf1);
10756
1.22M
    if (release2)
10757
45.9k
        PyMem_Free((void *)buf2);
10758
1.22M
    assert(_PyUnicode_CheckConsistency(u, 1));
10759
1.22M
    return u;
10760
10761
15.1M
  nothing:
10762
    /* nothing to replace; return original string (when possible) */
10763
15.1M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10764
15.1M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10765
15.1M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10766
15.1M
    if (srelease)
10767
0
        PyMem_Free((void *)sbuf);
10768
15.1M
    if (release1)
10769
741k
        PyMem_Free((void *)buf1);
10770
15.1M
    if (release2)
10771
0
        PyMem_Free((void *)buf2);
10772
15.1M
    return unicode_result_unchanged(self);
10773
10774
0
  error:
10775
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10776
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10777
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10778
0
    if (srelease)
10779
0
        PyMem_Free((void *)sbuf);
10780
0
    if (release1)
10781
0
        PyMem_Free((void *)buf1);
10782
0
    if (release2)
10783
0
        PyMem_Free((void *)buf2);
10784
0
    return NULL;
10785
1.22M
}
10786
10787
/* --- Unicode Object Methods --------------------------------------------- */
10788
10789
/*[clinic input]
10790
@permit_long_docstring_body
10791
str.title as unicode_title
10792
10793
Return a version of the string where each word is titlecased.
10794
10795
More specifically, words start with uppercased characters and all remaining
10796
cased characters have lower case.
10797
[clinic start generated code]*/
10798
10799
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* str.title(): the work is done by case_operation() with the do_title
       callback; its result (possibly NULL) is returned as is. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10805
10806
/*[clinic input]
10807
@permit_long_docstring_body
10808
str.capitalize as unicode_capitalize
10809
10810
Return a capitalized version of the string.
10811
10812
More specifically, make the first character have upper case and the rest lower
10813
case.
10814
[clinic start generated code]*/
10815
10816
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* str.capitalize(): non-empty strings go through case_operation() with
       the do_capitalize callback; an empty string is returned unchanged. */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10824
10825
/*[clinic input]
10826
str.casefold as unicode_casefold
10827
10828
Return a version of the string suitable for caseless comparisons.
10829
[clinic start generated code]*/
10830
10831
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): non-ASCII strings use the generic case_operation()
       driver with do_casefold. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    /* ASCII strings take the ascii_upper_or_lower() shortcut.
       NOTE(review): the second argument 1 presumably selects lower-casing;
       confirm against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 1);
}
10839
10840
10841
/* Argument converter. Accepts a single Unicode character. */
10842
10843
static int
10844
convert_uc(PyObject *obj, void *addr)
10845
130
{
10846
130
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10847
10848
130
    if (!PyUnicode_Check(obj)) {
10849
0
        PyErr_Format(PyExc_TypeError,
10850
0
                     "The fill character must be a unicode character, "
10851
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10852
0
        return 0;
10853
0
    }
10854
130
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10855
0
        PyErr_SetString(PyExc_TypeError,
10856
0
                        "The fill character must be exactly one character long");
10857
0
        return 0;
10858
0
    }
10859
130
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10860
130
    return 1;
10861
130
}
10862
10863
/*[clinic input]
10864
str.center as unicode_center
10865
10866
    width: Py_ssize_t
10867
    fillchar: Py_UCS4 = ' '
10868
    /
10869
10870
Return a centered string of length width.
10871
10872
Padding is done using the specified fill character (default is a space).
10873
[clinic start generated code]*/
10874
10875
static PyObject *
10876
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10877
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10878
0
{
10879
0
    Py_ssize_t marg, left;
10880
10881
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10882
0
        return unicode_result_unchanged(self);
10883
10884
0
    marg = width - PyUnicode_GET_LENGTH(self);
10885
0
    left = marg / 2 + (marg & width & 1);
10886
10887
0
    return pad(self, left, marg - left, fillchar);
10888
0
}
10889
10890
/* This function assumes that str1 and str2 are readied by the caller. */
10891
10892
static int
10893
unicode_compare(PyObject *str1, PyObject *str2)
10894
13.9M
{
10895
13.9M
#define COMPARE(TYPE1, TYPE2) \
10896
13.9M
    do { \
10897
12.2M
        TYPE1* p1 = (TYPE1 *)data1; \
10898
12.2M
        TYPE2* p2 = (TYPE2 *)data2; \
10899
12.2M
        TYPE1* end = p1 + len; \
10900
12.2M
        Py_UCS4 c1, c2; \
10901
12.2M
        for (; p1 != end; p1++, p2++) { \
10902
12.2M
            c1 = *p1; \
10903
12.2M
            c2 = *p2; \
10904
12.2M
            if (c1 != c2) \
10905
12.2M
                return (c1 < c2) ? -1 : 1; \
10906
12.2M
        } \
10907
12.2M
    } \
10908
12.2M
    while (0)
10909
10910
13.9M
    int kind1, kind2;
10911
13.9M
    const void *data1, *data2;
10912
13.9M
    Py_ssize_t len1, len2, len;
10913
10914
13.9M
    kind1 = PyUnicode_KIND(str1);
10915
13.9M
    kind2 = PyUnicode_KIND(str2);
10916
13.9M
    data1 = PyUnicode_DATA(str1);
10917
13.9M
    data2 = PyUnicode_DATA(str2);
10918
13.9M
    len1 = PyUnicode_GET_LENGTH(str1);
10919
13.9M
    len2 = PyUnicode_GET_LENGTH(str2);
10920
13.9M
    len = Py_MIN(len1, len2);
10921
10922
13.9M
    switch(kind1) {
10923
1.28M
    case PyUnicode_1BYTE_KIND:
10924
1.28M
    {
10925
1.28M
        switch(kind2) {
10926
299k
        case PyUnicode_1BYTE_KIND:
10927
299k
        {
10928
299k
            int cmp = memcmp(data1, data2, len);
10929
            /* normalize result of memcmp() into the range [-1; 1] */
10930
299k
            if (cmp < 0)
10931
265k
                return -1;
10932
34.0k
            if (cmp > 0)
10933
27.9k
                return 1;
10934
6.14k
            break;
10935
34.0k
        }
10936
810k
        case PyUnicode_2BYTE_KIND:
10937
810k
            COMPARE(Py_UCS1, Py_UCS2);
10938
0
            break;
10939
169k
        case PyUnicode_4BYTE_KIND:
10940
169k
            COMPARE(Py_UCS1, Py_UCS4);
10941
0
            break;
10942
0
        default:
10943
0
            Py_UNREACHABLE();
10944
1.28M
        }
10945
6.14k
        break;
10946
1.28M
    }
10947
10.9M
    case PyUnicode_2BYTE_KIND:
10948
10.9M
    {
10949
10.9M
        switch(kind2) {
10950
4.23k
        case PyUnicode_1BYTE_KIND:
10951
4.23k
            COMPARE(Py_UCS2, Py_UCS1);
10952
0
            break;
10953
9.97M
        case PyUnicode_2BYTE_KIND:
10954
9.97M
        {
10955
9.97M
            COMPARE(Py_UCS2, Py_UCS2);
10956
0
            break;
10957
9.97M
        }
10958
985k
        case PyUnicode_4BYTE_KIND:
10959
985k
            COMPARE(Py_UCS2, Py_UCS4);
10960
0
            break;
10961
0
        default:
10962
0
            Py_UNREACHABLE();
10963
10.9M
        }
10964
0
        break;
10965
10.9M
    }
10966
1.74M
    case PyUnicode_4BYTE_KIND:
10967
1.74M
    {
10968
1.74M
        switch(kind2) {
10969
1.25k
        case PyUnicode_1BYTE_KIND:
10970
1.25k
            COMPARE(Py_UCS4, Py_UCS1);
10971
0
            break;
10972
296k
        case PyUnicode_2BYTE_KIND:
10973
296k
            COMPARE(Py_UCS4, Py_UCS2);
10974
0
            break;
10975
1.44M
        case PyUnicode_4BYTE_KIND:
10976
1.44M
        {
10977
1.44M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10978
1.44M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10979
            /* normalize result of wmemcmp() into the range [-1; 1] */
10980
1.44M
            if (cmp < 0)
10981
716k
                return -1;
10982
732k
            if (cmp > 0)
10983
732k
                return 1;
10984
#else
10985
            COMPARE(Py_UCS4, Py_UCS4);
10986
#endif
10987
0
            break;
10988
732k
        }
10989
0
        default:
10990
0
            Py_UNREACHABLE();
10991
1.74M
        }
10992
0
        break;
10993
1.74M
    }
10994
0
    default:
10995
0
        Py_UNREACHABLE();
10996
13.9M
    }
10997
10998
6.14k
    if (len1 == len2)
10999
6.11k
        return 0;
11000
35
    if (len1 < len2)
11001
12
        return -1;
11002
23
    else
11003
23
        return 1;
11004
11005
35
#undef COMPARE
11006
35
}
11007
11008
11009
int
11010
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11011
549M
{
11012
549M
    assert(PyUnicode_Check(str1));
11013
549M
    assert(PyUnicode_Check(str2));
11014
549M
    if (str1 == str2) {
11015
80.2M
        return 1;
11016
80.2M
    }
11017
469M
    return unicode_eq(str1, str2);
11018
549M
}
11019
11020
11021
int
11022
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11023
0
{
11024
0
    if (!PyUnicode_Check(str1)) {
11025
0
        PyErr_Format(PyExc_TypeError,
11026
0
                     "first argument must be str, not %T", str1);
11027
0
        return -1;
11028
0
    }
11029
0
    if (!PyUnicode_Check(str2)) {
11030
0
        PyErr_Format(PyExc_TypeError,
11031
0
                     "second argument must be str, not %T", str2);
11032
0
        return -1;
11033
0
    }
11034
11035
0
    return _PyUnicode_Equal(str1, str2);
11036
0
}
11037
11038
11039
int
11040
PyUnicode_Compare(PyObject *left, PyObject *right)
11041
228k
{
11042
228k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11043
        /* a string is equal to itself */
11044
228k
        if (left == right)
11045
0
            return 0;
11046
11047
228k
        return unicode_compare(left, right);
11048
228k
    }
11049
0
    PyErr_Format(PyExc_TypeError,
11050
0
                 "Can't compare %.100s and %.100s",
11051
0
                 Py_TYPE(left)->tp_name,
11052
0
                 Py_TYPE(right)->tp_name);
11053
0
    return -1;
11054
228k
}
11055
11056
int
11057
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11058
4.66M
{
11059
4.66M
    Py_ssize_t i;
11060
4.66M
    int kind;
11061
4.66M
    Py_UCS4 chr;
11062
11063
4.66M
    assert(_PyUnicode_CHECK(uni));
11064
4.66M
    kind = PyUnicode_KIND(uni);
11065
4.66M
    if (kind == PyUnicode_1BYTE_KIND) {
11066
4.66M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11067
4.66M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11068
4.66M
        size_t len, len2 = strlen(str);
11069
4.66M
        int cmp;
11070
11071
4.66M
        len = Py_MIN(len1, len2);
11072
4.66M
        cmp = memcmp(data, str, len);
11073
4.66M
        if (cmp != 0) {
11074
4.22M
            if (cmp < 0)
11075
46.2k
                return -1;
11076
4.18M
            else
11077
4.18M
                return 1;
11078
4.22M
        }
11079
437k
        if (len1 > len2)
11080
298
            return 1; /* uni is longer */
11081
436k
        if (len1 < len2)
11082
704
            return -1; /* str is longer */
11083
436k
        return 0;
11084
436k
    }
11085
1.19k
    else {
11086
1.19k
        const void *data = PyUnicode_DATA(uni);
11087
        /* Compare Unicode string and source character set string */
11088
2.29k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11089
2.09k
            if (chr != (unsigned char)str[i])
11090
994
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11091
        /* This check keeps Python strings that end in '\0' from comparing equal
11092
         to C strings identical up to that point. */
11093
203
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11094
203
            return 1; /* uni is longer */
11095
0
        if (str[i])
11096
0
            return -1; /* str is longer */
11097
0
        return 0;
11098
0
    }
11099
4.66M
}
11100
11101
int
11102
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11103
28
{
11104
28
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11105
28
}
11106
11107
int
11108
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11109
28
{
11110
28
    assert(_PyUnicode_CHECK(unicode));
11111
28
    assert(str);
11112
11113
28
    if (PyUnicode_IS_ASCII(unicode)) {
11114
28
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11115
28
        return size == len &&
11116
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11117
28
    }
11118
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11119
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11120
0
        return size == len &&
11121
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11122
0
    }
11123
11124
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11125
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11126
0
        return 0;
11127
0
    }
11128
0
    const unsigned char *s = (const unsigned char *)str;
11129
0
    const unsigned char *ends = s + (size_t)size;
11130
0
    int kind = PyUnicode_KIND(unicode);
11131
0
    const void *data = PyUnicode_DATA(unicode);
11132
    /* Compare Unicode string and UTF-8 string */
11133
0
    for (Py_ssize_t i = 0; i < len; i++) {
11134
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11135
0
        if (ch < 0x80) {
11136
0
            if (ends == s || s[0] != ch) {
11137
0
                return 0;
11138
0
            }
11139
0
            s += 1;
11140
0
        }
11141
0
        else if (ch < 0x800) {
11142
0
            if ((ends - s) < 2 ||
11143
0
                s[0] != (0xc0 | (ch >> 6)) ||
11144
0
                s[1] != (0x80 | (ch & 0x3f)))
11145
0
            {
11146
0
                return 0;
11147
0
            }
11148
0
            s += 2;
11149
0
        }
11150
0
        else if (ch < 0x10000) {
11151
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11152
0
                (ends - s) < 3 ||
11153
0
                s[0] != (0xe0 | (ch >> 12)) ||
11154
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11155
0
                s[2] != (0x80 | (ch & 0x3f)))
11156
0
            {
11157
0
                return 0;
11158
0
            }
11159
0
            s += 3;
11160
0
        }
11161
0
        else {
11162
0
            assert(ch <= MAX_UNICODE);
11163
0
            if ((ends - s) < 4 ||
11164
0
                s[0] != (0xf0 | (ch >> 18)) ||
11165
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11166
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11167
0
                s[3] != (0x80 | (ch & 0x3f)))
11168
0
            {
11169
0
                return 0;
11170
0
            }
11171
0
            s += 4;
11172
0
        }
11173
0
    }
11174
0
    return s == ends;
11175
0
}
11176
11177
int
11178
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11179
35.1M
{
11180
35.1M
    size_t len;
11181
35.1M
    assert(_PyUnicode_CHECK(unicode));
11182
35.1M
    assert(str);
11183
#ifndef NDEBUG
11184
    for (const char *p = str; *p; p++) {
11185
        assert((unsigned char)*p < 128);
11186
    }
11187
#endif
11188
35.1M
    if (!PyUnicode_IS_ASCII(unicode))
11189
123k
        return 0;
11190
35.0M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11191
35.0M
    return strlen(str) == len &&
11192
620k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11193
35.1M
}
11194
11195
/* Rich comparison of two objects as strings.

   - If either operand fails PyUnicode_Check(), NotImplemented is returned.
   - If both operands are the same object, the answer is derived from 'op'
     alone; an unrecognised 'op' calls PyErr_BadArgument() and returns NULL.
   - Py_EQ / Py_NE are answered through unicode_eq().
   - Every other 'op' is answered by comparing unicode_compare()'s result
     against 0 with Py_RETURN_RICHCOMPARE. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        int outcome = unicode_eq(left, right);
        /* Flip the equality answer when inequality was requested. */
        outcome ^= (op == Py_NE);
        return PyBool_FromLong(outcome);
    }

    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11229
11230
int
11231
PyUnicode_Contains(PyObject *str, PyObject *substr)
11232
201M
{
11233
201M
    int kind1, kind2;
11234
201M
    const void *buf1, *buf2;
11235
201M
    Py_ssize_t len1, len2;
11236
201M
    int result;
11237
11238
201M
    if (!PyUnicode_Check(substr)) {
11239
0
        PyErr_Format(PyExc_TypeError,
11240
0
                     "'in <string>' requires string as left operand, not %.100s",
11241
0
                     Py_TYPE(substr)->tp_name);
11242
0
        return -1;
11243
0
    }
11244
201M
    if (ensure_unicode(str) < 0)
11245
0
        return -1;
11246
11247
201M
    kind1 = PyUnicode_KIND(str);
11248
201M
    kind2 = PyUnicode_KIND(substr);
11249
201M
    if (kind1 < kind2)
11250
13.8M
        return 0;
11251
187M
    len1 = PyUnicode_GET_LENGTH(str);
11252
187M
    len2 = PyUnicode_GET_LENGTH(substr);
11253
187M
    if (len1 < len2)
11254
712k
        return 0;
11255
187M
    buf1 = PyUnicode_DATA(str);
11256
187M
    buf2 = PyUnicode_DATA(substr);
11257
187M
    if (len2 == 1) {
11258
170M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11259
170M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11260
170M
        return result;
11261
170M
    }
11262
16.5M
    if (kind2 != kind1) {
11263
18.7k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11264
18.7k
        if (!buf2)
11265
0
            return -1;
11266
18.7k
    }
11267
11268
16.5M
    switch (kind1) {
11269
16.5M
    case PyUnicode_1BYTE_KIND:
11270
16.5M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11271
16.5M
        break;
11272
14.2k
    case PyUnicode_2BYTE_KIND:
11273
14.2k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11274
14.2k
        break;
11275
4.51k
    case PyUnicode_4BYTE_KIND:
11276
4.51k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11277
4.51k
        break;
11278
0
    default:
11279
0
        Py_UNREACHABLE();
11280
16.5M
    }
11281
11282
16.5M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11283
16.5M
    if (kind2 != kind1)
11284
18.7k
        PyMem_Free((void *)buf2);
11285
11286
16.5M
    return result;
11287
16.5M
}
11288
11289
/* Concat to string or Unicode object giving a new Unicode object. */
11290
11291
PyObject *
11292
PyUnicode_Concat(PyObject *left, PyObject *right)
11293
21.4M
{
11294
21.4M
    PyObject *result;
11295
21.4M
    Py_UCS4 maxchar, maxchar2;
11296
21.4M
    Py_ssize_t left_len, right_len, new_len;
11297
11298
21.4M
    if (ensure_unicode(left) < 0)
11299
0
        return NULL;
11300
11301
21.4M
    if (!PyUnicode_Check(right)) {
11302
0
        PyErr_Format(PyExc_TypeError,
11303
0
            "can only concatenate str (not \"%.200s\") to str",
11304
0
            Py_TYPE(right)->tp_name);
11305
0
        return NULL;
11306
0
    }
11307
11308
    /* Shortcuts */
11309
21.4M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11310
21.4M
    if (left == empty) {
11311
101k
        return PyUnicode_FromObject(right);
11312
101k
    }
11313
21.3M
    if (right == empty) {
11314
978k
        return PyUnicode_FromObject(left);
11315
978k
    }
11316
11317
20.3M
    left_len = PyUnicode_GET_LENGTH(left);
11318
20.3M
    right_len = PyUnicode_GET_LENGTH(right);
11319
20.3M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11320
0
        PyErr_SetString(PyExc_OverflowError,
11321
0
                        "strings are too large to concat");
11322
0
        return NULL;
11323
0
    }
11324
20.3M
    new_len = left_len + right_len;
11325
11326
20.3M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11327
20.3M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11328
20.3M
    maxchar = Py_MAX(maxchar, maxchar2);
11329
11330
    /* Concat the two Unicode strings */
11331
20.3M
    result = PyUnicode_New(new_len, maxchar);
11332
20.3M
    if (result == NULL)
11333
0
        return NULL;
11334
20.3M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11335
20.3M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11336
20.3M
    assert(_PyUnicode_CheckConsistency(result, 1));
11337
20.3M
    return result;
11338
20.3M
}
11339
11340
void
11341
PyUnicode_Append(PyObject **p_left, PyObject *right)
11342
4.94M
{
11343
4.94M
    PyObject *left, *res;
11344
4.94M
    Py_UCS4 maxchar, maxchar2;
11345
4.94M
    Py_ssize_t left_len, right_len, new_len;
11346
11347
4.94M
    if (p_left == NULL) {
11348
0
        if (!PyErr_Occurred())
11349
0
            PyErr_BadInternalCall();
11350
0
        return;
11351
0
    }
11352
4.94M
    left = *p_left;
11353
4.94M
    if (right == NULL || left == NULL
11354
4.94M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11355
0
        if (!PyErr_Occurred())
11356
0
            PyErr_BadInternalCall();
11357
0
        goto error;
11358
0
    }
11359
11360
    /* Shortcuts */
11361
4.94M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11362
4.94M
    if (left == empty) {
11363
421k
        Py_DECREF(left);
11364
421k
        *p_left = Py_NewRef(right);
11365
421k
        return;
11366
421k
    }
11367
4.52M
    if (right == empty) {
11368
11.4k
        return;
11369
11.4k
    }
11370
11371
4.51M
    left_len = PyUnicode_GET_LENGTH(left);
11372
4.51M
    right_len = PyUnicode_GET_LENGTH(right);
11373
4.51M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11374
0
        PyErr_SetString(PyExc_OverflowError,
11375
0
                        "strings are too large to concat");
11376
0
        goto error;
11377
0
    }
11378
4.51M
    new_len = left_len + right_len;
11379
11380
4.51M
    if (_PyUnicode_IsModifiable(left)
11381
4.51M
        && PyUnicode_CheckExact(right)
11382
4.51M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11383
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11384
           to change the structure size, but characters are stored just after
11385
           the structure, and so it requires to move all characters which is
11386
           not so different than duplicating the string. */
11387
1.71M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11388
1.71M
    {
11389
        /* append inplace */
11390
1.71M
        if (unicode_resize(p_left, new_len) != 0)
11391
0
            goto error;
11392
11393
        /* copy 'right' into the newly allocated area of 'left' */
11394
1.71M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11395
1.71M
    }
11396
2.80M
    else {
11397
2.80M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11398
2.80M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11399
2.80M
        maxchar = Py_MAX(maxchar, maxchar2);
11400
11401
        /* Concat the two Unicode strings */
11402
2.80M
        res = PyUnicode_New(new_len, maxchar);
11403
2.80M
        if (res == NULL)
11404
0
            goto error;
11405
2.80M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11406
2.80M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11407
2.80M
        Py_DECREF(left);
11408
2.80M
        *p_left = res;
11409
2.80M
    }
11410
4.51M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11411
4.51M
    return;
11412
11413
0
error:
11414
0
    Py_CLEAR(*p_left);
11415
0
}
11416
11417
void
11418
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11419
8
{
11420
8
    PyUnicode_Append(pleft, right);
11421
8
    Py_XDECREF(right);
11422
8
}
11423
11424
/*[clinic input]
11425
@permit_long_summary
11426
@text_signature "($self, sub[, start[, end]], /)"
11427
str.count as unicode_count -> Py_ssize_t
11428
11429
    self as str: self
11430
    sub as substr: unicode
11431
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11432
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11433
    /
11434
11435
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11436
11437
Optional arguments start and end are interpreted as in slice notation.
11438
[clinic start generated code]*/
11439
11440
static Py_ssize_t
11441
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11442
                   Py_ssize_t end)
11443
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11444
23.3M
{
11445
23.3M
    assert(PyUnicode_Check(str));
11446
23.3M
    assert(PyUnicode_Check(substr));
11447
11448
23.3M
    Py_ssize_t result;
11449
23.3M
    int kind1, kind2;
11450
23.3M
    const void *buf1 = NULL, *buf2 = NULL;
11451
23.3M
    Py_ssize_t len1, len2;
11452
11453
23.3M
    kind1 = PyUnicode_KIND(str);
11454
23.3M
    kind2 = PyUnicode_KIND(substr);
11455
23.3M
    if (kind1 < kind2)
11456
0
        return 0;
11457
11458
23.3M
    len1 = PyUnicode_GET_LENGTH(str);
11459
23.3M
    len2 = PyUnicode_GET_LENGTH(substr);
11460
23.3M
    ADJUST_INDICES(start, end, len1);
11461
23.3M
    if (end - start < len2)
11462
3.04M
        return 0;
11463
11464
20.2M
    buf1 = PyUnicode_DATA(str);
11465
20.2M
    buf2 = PyUnicode_DATA(substr);
11466
20.2M
    if (kind2 != kind1) {
11467
4.75M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11468
4.75M
        if (!buf2)
11469
0
            goto onError;
11470
4.75M
    }
11471
11472
    // We don't reuse `anylib_count` here because of the explicit casts.
11473
20.2M
    switch (kind1) {
11474
15.5M
    case PyUnicode_1BYTE_KIND:
11475
15.5M
        result = ucs1lib_count(
11476
15.5M
            ((const Py_UCS1*)buf1) + start, end - start,
11477
15.5M
            buf2, len2, PY_SSIZE_T_MAX
11478
15.5M
            );
11479
15.5M
        break;
11480
3.45M
    case PyUnicode_2BYTE_KIND:
11481
3.45M
        result = ucs2lib_count(
11482
3.45M
            ((const Py_UCS2*)buf1) + start, end - start,
11483
3.45M
            buf2, len2, PY_SSIZE_T_MAX
11484
3.45M
            );
11485
3.45M
        break;
11486
1.30M
    case PyUnicode_4BYTE_KIND:
11487
1.30M
        result = ucs4lib_count(
11488
1.30M
            ((const Py_UCS4*)buf1) + start, end - start,
11489
1.30M
            buf2, len2, PY_SSIZE_T_MAX
11490
1.30M
            );
11491
1.30M
        break;
11492
0
    default:
11493
0
        Py_UNREACHABLE();
11494
20.2M
    }
11495
11496
20.2M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11497
20.2M
    if (kind2 != kind1)
11498
4.75M
        PyMem_Free((void *)buf2);
11499
11500
20.2M
    return result;
11501
0
  onError:
11502
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11503
0
    if (kind2 != kind1)
11504
0
        PyMem_Free((void *)buf2);
11505
0
    return -1;
11506
20.2M
}
11507
11508
/*[clinic input]
11509
str.encode as unicode_encode
11510
11511
    encoding: str(c_default="NULL") = 'utf-8'
11512
        The encoding in which to encode the string.
11513
    errors: str(c_default="NULL") = 'strict'
11514
        The error handling scheme to use for encoding errors.
11515
        The default is 'strict' meaning that encoding errors raise a
11516
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11517
        'xmlcharrefreplace' as well as any other name registered with
11518
        codecs.register_error that can handle UnicodeEncodeErrors.
11519
11520
Encode the string using the codec registered for encoding.
11521
[clinic start generated code]*/
11522
11523
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): all work is delegated to PyUnicode_AsEncodedString().
       'encoding' and 'errors' are forwarded unchanged; per the clinic
       declaration their C default is NULL. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11529
11530
/*[clinic input]
11531
str.expandtabs as unicode_expandtabs
11532
11533
    tabsize: int = 8
11534
11535
Return a copy where all tab characters are expanded using spaces.
11536
11537
If tabsize is not given, a tab size of 8 characters is assumed.
11538
[clinic start generated code]*/
11539
11540
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): two passes over the source.  The first computes the
       length of the result and detects whether any tab is present; the
       second writes the result.  With tabsize <= 0 tabs produce no output
       at all.  '\n' and '\r' reset the column used for tab stops.
       Returns NULL with OverflowError if the result length would exceed
       PY_SSIZE_T_MAX, or NULL if the result cannot be allocated. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    PyObject *u;
    void *dest_data;
    Py_ssize_t pos, out_len, column, pad;
    Py_UCS4 ch;
    int saw_tab = 0;

    /* First pass: determine size of output string */
    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            column++;
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize <= 0) {
            continue;
        }
        pad = tabsize - (column % tabsize); /* cannot overflow */
        if (out_len > PY_SSIZE_T_MAX - pad) {
            goto overflow;
        }
        column += pad;
        out_len += pad;
    }

    /* Nothing to expand: hand back the input. */
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    u = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (u == NULL) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(u);

    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            column++;
            PyUnicode_WRITE(kind, dest_data, out_len, ch);
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize <= 0) {
            continue;
        }
        /* Pad with spaces up to the next tab stop. */
        pad = tabsize - (column % tabsize);
        column += pad;
        _PyUnicode_Fill(kind, dest_data, ' ', out_len, pad);
        out_len += pad;
    }
    assert (out_len == PyUnicode_GET_LENGTH(u));
    return unicode_result(u);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11615
11616
/*[clinic input]
11617
@permit_long_summary
11618
str.find as unicode_find = str.count
11619
11620
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11621
11622
Optional arguments start and end are interpreted as in slice notation.
11623
Return -1 on failure.
11624
[clinic start generated code]*/
11625
11626
static Py_ssize_t
11627
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11628
                  Py_ssize_t end)
11629
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11630
21.4M
{
11631
21.4M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11632
21.4M
    if (result < 0) {
11633
4.96M
        return -1;
11634
4.96M
    }
11635
16.5M
    return result;
11636
21.4M
}
11637
11638
static PyObject *
11639
unicode_getitem(PyObject *self, Py_ssize_t index)
11640
43.7M
{
11641
43.7M
    const void *data;
11642
43.7M
    int kind;
11643
43.7M
    Py_UCS4 ch;
11644
11645
43.7M
    if (!PyUnicode_Check(self)) {
11646
0
        PyErr_BadArgument();
11647
0
        return NULL;
11648
0
    }
11649
43.7M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11650
15.1k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11651
15.1k
        return NULL;
11652
15.1k
    }
11653
43.7M
    kind = PyUnicode_KIND(self);
11654
43.7M
    data = PyUnicode_DATA(self);
11655
43.7M
    ch = PyUnicode_READ(kind, data, index);
11656
43.7M
    return unicode_char(ch);
11657
43.7M
}
11658
11659
/* Believe it or not, this produces the same value for ASCII strings
11660
   as bytes_hash(). */
11661
static Py_hash_t
11662
unicode_hash(PyObject *self)
11663
981M
{
11664
981M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11665
11666
#ifdef Py_DEBUG
11667
    assert(_Py_HashSecret_Initialized);
11668
#endif
11669
981M
    Py_hash_t hash = PyUnicode_HASH(self);
11670
981M
    if (hash != -1) {
11671
939M
        return hash;
11672
939M
    }
11673
41.3M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11674
41.3M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11675
11676
41.3M
    PyUnicode_SET_HASH(self, x);
11677
41.3M
    return x;
11678
981M
}
11679
11680
/*[clinic input]
11681
@permit_long_summary
11682
str.index as unicode_index = str.count
11683
11684
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11685
11686
Optional arguments start and end are interpreted as in slice notation.
11687
Raises ValueError when the substring is not found.
11688
[clinic start generated code]*/
11689
11690
static Py_ssize_t
11691
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11692
                   Py_ssize_t end)
11693
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11694
45.2k
{
11695
45.2k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11696
45.2k
    if (result == -1) {
11697
712
        PyErr_SetString(PyExc_ValueError, "substring not found");
11698
712
    }
11699
44.5k
    else if (result < 0) {
11700
0
        return -1;
11701
0
    }
11702
45.2k
    return result;
11703
45.2k
}
11704
11705
/*[clinic input]
11706
str.isascii as unicode_isascii
11707
11708
Return True if all characters in the string are ASCII, False otherwise.
11709
11710
ASCII characters have code points in the range U+0000-U+007F.
11711
Empty string is ASCII too.
11712
[clinic start generated code]*/
11713
11714
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* str.isascii(): no scan is performed here; the answer is whatever
       PyUnicode_IS_ASCII() reports for the object. */
    if (PyUnicode_IS_ASCII(self)) {
        Py_RETURN_TRUE;
    }
    Py_RETURN_FALSE;
}
11720
11721
/*[clinic input]
11722
@permit_long_docstring_body
11723
str.islower as unicode_islower
11724
11725
Return True if the string is a lowercase string, False otherwise.
11726
11727
A string is lowercase if all cased characters in the string are lowercase and
11728
there is at least one cased character in the string.
11729
[clinic start generated code]*/
11730
11731
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* str.islower(): True when no character is uppercase or titlecase and
       at least one character is lowercase.  The empty string is False. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    int saw_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* Any upper/title character disqualifies the string immediately. */
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character was seen the test is skipped. */
        if (!saw_lower && Py_UNICODE_ISLOWER(ch)) {
            saw_lower = 1;
        }
    }
    return PyBool_FromLong(saw_lower);
}
11764
11765
/*[clinic input]
11766
@permit_long_docstring_body
11767
str.isupper as unicode_isupper
11768
11769
Return True if the string is an uppercase string, False otherwise.
11770
11771
A string is uppercase if all cased characters in the string are uppercase and
11772
there is at least one cased character in the string.
11773
[clinic start generated code]*/
11774
11775
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* str.isupper(): True when no character is lowercase or titlecase and
       at least one character is uppercase.  The empty string is False. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    int saw_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* Any lower/title character disqualifies the string immediately. */
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character was seen the test is skipped. */
        if (!saw_upper && Py_UNICODE_ISUPPER(ch)) {
            saw_upper = 1;
        }
    }
    return PyBool_FromLong(saw_upper);
}
11808
11809
/*[clinic input]
11810
str.istitle as unicode_istitle
11811
11812
Return True if the string is a title-cased string, False otherwise.
11813
11814
In a title-cased string, upper- and title-case characters may only
11815
follow uncased characters and lowercase characters only cased ones.
11816
[clinic start generated code]*/
11817
11818
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* str.istitle(): an upper/titlecase character must not follow a cased
       character, a lowercase character must follow a cased one, and at
       least one cased character must be present.  The empty string is
       False. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    int seen_cased = 0;   /* has any cased character occurred so far? */
    int prev_cased = 0;   /* was the previous character cased? */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* Upper/title character right after a cased one. */
            if (prev_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* Lowercase character not preceded by a cased one. */
            if (!prev_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            /* Uncased character: it only resets the "previous" state. */
            prev_cased = 0;
            continue;
        }
        prev_cased = 1;
        seen_cased = 1;
    }
    return PyBool_FromLong(seen_cased);
}
11864
11865
/*[clinic input]
11866
@permit_long_docstring_body
11867
str.isspace as unicode_isspace
11868
11869
Return True if the string is a whitespace string, False otherwise.
11870
11871
A string is whitespace if all characters in the string are whitespace and there
11872
is at least one character in the string.
11873
[clinic start generated code]*/
11874
11875
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* str.isspace(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISSPACE(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not whitespace. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11903
11904
/*[clinic input]
11905
@permit_long_docstring_body
11906
str.isalpha as unicode_isalpha
11907
11908
Return True if the string is an alphabetic string, False otherwise.
11909
11910
A string is alphabetic if all characters in the string are alphabetic and there
11911
is at least one character in the string.
11912
[clinic start generated code]*/
11913
11914
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* str.isalpha(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISALPHA(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not alphabetic. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11941
11942
/*[clinic input]
11943
@permit_long_docstring_body
11944
str.isalnum as unicode_isalnum
11945
11946
Return True if the string is an alpha-numeric string, False otherwise.
11947
11948
A string is alpha-numeric if all characters in the string are alpha-numeric and
11949
there is at least one character in the string.
11950
[clinic start generated code]*/
11951
11952
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* str.isalnum(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISALNUM(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not alpha-numeric.  The character
       is read into a local before being handed to the classifier. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11981
11982
/*[clinic input]
11983
@permit_long_docstring_body
11984
str.isdecimal as unicode_isdecimal
11985
11986
Return True if the string is a decimal string, False otherwise.
11987
11988
A string is a decimal string if all characters in the string are decimal and
11989
there is at least one character in the string.
11990
[clinic start generated code]*/
11991
11992
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* str.isdecimal(): True when the string is non-empty and every
       character satisfies Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not a decimal. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12019
12020
/*[clinic input]
12021
@permit_long_docstring_body
12022
str.isdigit as unicode_isdigit
12023
12024
Return True if the string is a digit string, False otherwise.
12025
12026
A string is a digit string if all characters in the string are digits and there
12027
is at least one character in the string.
12028
[clinic start generated code]*/
12029
12030
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* str.isdigit(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not a digit. */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12058
12059
/*[clinic input]
12060
@permit_long_docstring_body
12061
str.isnumeric as unicode_isnumeric
12062
12063
Return True if the string is a numeric string, False otherwise.
12064
12065
A string is numeric if all characters in the string are numeric and there is at
12066
least one character in the string.
12067
[clinic start generated code]*/
12068
12069
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* str.isnumeric(): True only for a non-empty string in which every code
       point satisfies Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    switch (nchars) {
    case 0:
        /* Empty strings are never numeric. */
        Py_RETURN_FALSE;
    case 1: {
        /* Single-character shortcut. */
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }
    default:
        break;
    }

    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12096
12097
Py_ssize_t
12098
_PyUnicode_ScanIdentifier(PyObject *self)
12099
60.9k
{
12100
60.9k
    Py_ssize_t i;
12101
60.9k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12102
60.9k
    if (len == 0) {
12103
        /* an empty string is not a valid identifier */
12104
0
        return 0;
12105
0
    }
12106
12107
60.9k
    int kind = PyUnicode_KIND(self);
12108
60.9k
    const void *data = PyUnicode_DATA(self);
12109
60.9k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12110
    /* PEP 3131 says that the first character must be in
12111
       XID_Start and subsequent characters in XID_Continue,
12112
       and for the ASCII range, the 2.x rules apply (i.e
12113
       start with letters and underscore, continue with
12114
       letters, digits, underscore). However, given the current
12115
       definition of XID_Start and XID_Continue, it is sufficient
12116
       to check just for these, except that _ must be allowed
12117
       as starting an identifier.  */
12118
60.9k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12119
804
        return 0;
12120
804
    }
12121
12122
489k
    for (i = 1; i < len; i++) {
12123
429k
        ch = PyUnicode_READ(kind, data, i);
12124
429k
        if (!_PyUnicode_IsXidContinue(ch)) {
12125
338
            return i;
12126
338
        }
12127
429k
    }
12128
59.8k
    return i;
12129
60.1k
}
12130
12131
int
12132
PyUnicode_IsIdentifier(PyObject *self)
12133
50.1k
{
12134
50.1k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12135
50.1k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12136
    /* an empty string is not a valid identifier */
12137
50.1k
    return len && i == len;
12138
50.1k
}
12139
12140
/*[clinic input]
12141
@permit_long_docstring_body
12142
str.isidentifier as unicode_isidentifier
12143
12144
Return True if the string is a valid Python identifier, False otherwise.
12145
12146
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12147
such as "def" or "class".
12148
[clinic start generated code]*/
12149
12150
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* str.isidentifier(): wrap the C-level predicate in a Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12156
12157
/*[clinic input]
12158
@permit_long_summary
12159
str.isprintable as unicode_isprintable
12160
12161
Return True if all characters in the string are printable, False otherwise.
12162
12163
A character is printable if repr() may use it in its output.
12164
[clinic start generated code]*/
12165
12166
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* str.isprintable(): True when every code point satisfies
       Py_UNICODE_ISPRINTABLE.  There is no empty-string special case, so an
       empty string falls through the loop and yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12190
12191
/*[clinic input]
12192
@permit_long_docstring_body
12193
str.join as unicode_join
12194
12195
    iterable: object
12196
    /
12197
12198
Concatenate any number of strings.
12199
12200
The string whose method is called is inserted in between each given string.
12201
The result is returned as a new string.
12202
12203
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12204
[clinic start generated code]*/
12205
12206
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join(): all of the work is delegated to PyUnicode_Join(), with
       self acting as the separator. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12212
12213
static Py_ssize_t
12214
unicode_length(PyObject *self)
12215
24.8M
{
12216
24.8M
    return PyUnicode_GET_LENGTH(self);
12217
24.8M
}
12218
12219
/*[clinic input]
12220
str.ljust as unicode_ljust
12221
12222
    width: Py_ssize_t
12223
    fillchar: Py_UCS4 = ' '
12224
    /
12225
12226
Return a left-justified string of length width.
12227
12228
Padding is done using the specified fill character (default is a space).
12229
[clinic start generated code]*/
12230
12231
static PyObject *
12232
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12233
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12234
130
{
12235
130
    if (PyUnicode_GET_LENGTH(self) >= width)
12236
62
        return unicode_result_unchanged(self);
12237
12238
68
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12239
130
}
12240
12241
/*[clinic input]
12242
str.lower as unicode_lower
12243
12244
Return a copy of the string converted to lowercase.
12245
[clinic start generated code]*/
12246
12247
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* str.lower(): non-ASCII strings go through the generic case_operation()
       machinery; ASCII strings use the dedicated ASCII helper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12255
12256
64.0M
/* Strip modes shared by the strip helpers; the values double as indexes
   into stripfuncnames below, so they must stay 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the pointers and the
   characters are const, so the table is read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i (used in error text). */
#define STRIPNAME(i) (stripfuncnames[i])
12264
12265
/* externally visible for str.strip(unicode) */
12266
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from self every leading and/or trailing character (depending on
       striptype) that occurs in sepobj, and return the remaining slice.
       A bloom mask built from sepobj rejects most non-members cheaply;
       PyUnicode_FindChar() then confirms real membership. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* first: index of the first character that is kept. */
    Py_ssize_t first = 0;
    if (striptype != RIGHTSTRIP) {
        for (; first < len; first++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            /* Any negative result (not found, or a failure inside
               PyUnicode_FindChar) stops the scan. */
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    /* stop: one past the last character that is kept. */
    Py_ssize_t stop = len;
    if (striptype != LEFTSTRIP) {
        for (; stop > first; stop--) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12312
12313
/* Slice an exact str using start/stop index objects.  The bounds are
   resolved by _PyEval_UnpackIndices() against the string length; when that
   helper reports failure NULL is returned, otherwise the result of
   PyUnicode_Substring() is returned. */
PyObject*
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
{
    assert(PyUnicode_CheckExact(container));

    Py_ssize_t istart, istop;
    const Py_ssize_t len = PyUnicode_GET_LENGTH(container);

    if (_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
        return PyUnicode_Substring(container, istart, istop);
    }
    return NULL;
}
12324
12325
PyObject*
12326
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12327
255M
{
12328
255M
    const unsigned char *data;
12329
255M
    int kind;
12330
255M
    Py_ssize_t length;
12331
12332
255M
    length = PyUnicode_GET_LENGTH(self);
12333
255M
    end = Py_MIN(end, length);
12334
12335
255M
    if (start == 0 && end == length)
12336
65.7M
        return unicode_result_unchanged(self);
12337
12338
189M
    if (start < 0 || end < 0) {
12339
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12340
0
        return NULL;
12341
0
    }
12342
189M
    if (start >= length || end < start)
12343
4.95M
        _Py_RETURN_UNICODE_EMPTY();
12344
12345
184M
    length = end - start;
12346
184M
    if (PyUnicode_IS_ASCII(self)) {
12347
64.1M
        data = PyUnicode_1BYTE_DATA(self);
12348
64.1M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12349
64.1M
    }
12350
120M
    else {
12351
120M
        kind = PyUnicode_KIND(self);
12352
120M
        data = PyUnicode_1BYTE_DATA(self);
12353
120M
        return PyUnicode_FromKindAndData(kind,
12354
120M
                                         data + kind * start,
12355
120M
                                         length);
12356
120M
    }
12357
184M
}
12358
12359
/* Strip whitespace from self according to striptype (LEFTSTRIP, RIGHTSTRIP
   or BOTHSTRIP) and return the remaining slice via PyUnicode_Substring().
   ASCII strings are classified with the _Py_ascii_whitespace table; all
   other strings use Py_UNICODE_ISSPACE. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);
    Py_ssize_t first = 0;    /* index of the first kept character */
    Py_ssize_t stop = len;   /* one past the last kept character */

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS1 ch = data[first];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS1 ch = data[stop - 1];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12420
12421
12422
/* Shared implementation of strip/lstrip/rstrip with an optional argument.
   sep == None strips whitespace, a str strips the characters it contains,
   and anything else raises TypeError naming the calling method. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12438
12439
12440
/*[clinic input]
12441
@permit_long_summary
12442
str.strip as unicode_strip
12443
12444
    chars: object = None
12445
    /
12446
12447
Return a copy of the string with leading and trailing whitespace removed.
12448
12449
If chars is given and not None, remove characters in chars instead.
12450
[clinic start generated code]*/
12451
12452
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* str.strip(): trim both ends. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12458
12459
12460
/*[clinic input]
12461
str.lstrip as unicode_lstrip
12462
12463
    chars: object = None
12464
    /
12465
12466
Return a copy of the string with leading whitespace removed.
12467
12468
If chars is given and not None, remove characters in chars instead.
12469
[clinic start generated code]*/
12470
12471
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): trim the leading end only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12477
12478
12479
/*[clinic input]
12480
str.rstrip as unicode_rstrip
12481
12482
    chars: object = None
12483
    /
12484
12485
Return a copy of the string with trailing whitespace removed.
12486
12487
If chars is given and not None, remove characters in chars instead.
12488
[clinic start generated code]*/
12489
12490
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): trim the trailing end only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12496
12497
12498
static PyObject*
12499
unicode_repeat(PyObject *str, Py_ssize_t len)
12500
305k
{
12501
305k
    PyObject *u;
12502
305k
    Py_ssize_t nchars, n;
12503
12504
305k
    if (len < 1)
12505
24.8k
        _Py_RETURN_UNICODE_EMPTY();
12506
12507
    /* no repeat, return original string */
12508
280k
    if (len == 1)
12509
25.1k
        return unicode_result_unchanged(str);
12510
12511
255k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12512
0
        PyErr_SetString(PyExc_OverflowError,
12513
0
                        "repeated string is too long");
12514
0
        return NULL;
12515
0
    }
12516
255k
    nchars = len * PyUnicode_GET_LENGTH(str);
12517
12518
255k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12519
255k
    if (!u)
12520
0
        return NULL;
12521
255k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12522
12523
255k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12524
253k
        int kind = PyUnicode_KIND(str);
12525
253k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12526
253k
        if (kind == PyUnicode_1BYTE_KIND) {
12527
253k
            void *to = PyUnicode_DATA(u);
12528
253k
            memset(to, (unsigned char)fill_char, len);
12529
253k
        }
12530
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12531
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12532
0
            for (n = 0; n < len; ++n)
12533
0
                ucs2[n] = fill_char;
12534
0
        } else {
12535
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12536
0
            assert(kind == PyUnicode_4BYTE_KIND);
12537
0
            for (n = 0; n < len; ++n)
12538
0
                ucs4[n] = fill_char;
12539
0
        }
12540
253k
    }
12541
2.24k
    else {
12542
2.24k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12543
2.24k
        char *to = (char *) PyUnicode_DATA(u);
12544
2.24k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12545
2.24k
            PyUnicode_GET_LENGTH(str) * char_size);
12546
2.24k
    }
12547
12548
255k
    assert(_PyUnicode_CheckConsistency(u, 1));
12549
255k
    return u;
12550
255k
}
12551
12552
PyObject *
12553
PyUnicode_Replace(PyObject *str,
12554
                  PyObject *substr,
12555
                  PyObject *replstr,
12556
                  Py_ssize_t maxcount)
12557
0
{
12558
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12559
0
            ensure_unicode(replstr) < 0)
12560
0
        return NULL;
12561
0
    return replace(str, substr, replstr, maxcount);
12562
0
}
12563
12564
/*[clinic input]
12565
@permit_long_docstring_body
12566
str.replace as unicode_replace
12567
12568
    old: unicode
12569
    new: unicode
12570
    /
12571
    count: Py_ssize_t = -1
12572
        Maximum number of occurrences to replace.
12573
        -1 (the default value) means replace all occurrences.
12574
12575
Return a copy with all occurrences of substring old replaced by new.
12576
12577
If the optional argument count is given, only the first count occurrences are
12578
replaced.
12579
[clinic start generated code]*/
12580
12581
static PyObject *
12582
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12583
                     Py_ssize_t count)
12584
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12585
16.3M
{
12586
16.3M
    return replace(self, old, new, count);
12587
16.3M
}
12588
12589
/*[clinic input]
12590
@permit_long_docstring_body
12591
str.removeprefix as unicode_removeprefix
12592
12593
    prefix: unicode
12594
    /
12595
12596
Return a str with the given prefix string removed if present.
12597
12598
If the string starts with the prefix string, return string[len(prefix):].
12599
Otherwise, return a copy of the original string.
12600
[clinic start generated code]*/
12601
12602
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* str.removeprefix(): tailmatch() with direction -1 tests the start of
       self.  -1 is treated as a failure, 0 as "no match". */
    const int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);

    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    const Py_ssize_t skip = PyUnicode_GET_LENGTH(prefix);
    return PyUnicode_Substring(self, skip, PyUnicode_GET_LENGTH(self));
}
12616
12617
/*[clinic input]
12618
str.removesuffix as unicode_removesuffix
12619
12620
    suffix: unicode
12621
    /
12622
12623
Return a str with the given suffix string removed if present.
12624
12625
If the string ends with the suffix string and that suffix is not empty,
12626
return string[:-len(suffix)]. Otherwise, return a copy of the original
12627
string.
12628
[clinic start generated code]*/
12629
12630
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* str.removesuffix(): tailmatch() with direction +1 tests the end of
       self.  -1 is treated as a failure, 0 as "no match". */
    const int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);

    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12644
12645
/* Build the repr() of a str object.

   A first pass over the input computes the exact output length, counts
   single and double quotes, and tracks the largest character that will be
   copied unescaped (which sets the storage kind of the result).  The result
   is then allocated once and filled either by a plain copy (nothing needs
   escaping) or by the ucsNlib_repr helper matching the output kind.

   Returns a new str, or NULL with OverflowError set when the escaped form
   would not fit in Py_ssize_t, or NULL when PyUnicode_New() fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \uHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    /* Must be evaluated before the quote characters are added to osize. */
    int changed = (osize != isize);
    /* Extra characters needed to backslash-escape single quotes. */
    Py_ssize_t escaped_quotes = 0;
    if (squote) {
        changed = 1;
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            escaped_quotes = squote;
        else
            quote = '"';
    }

    /* Account for the escaped quotes plus the two surrounding quotes.  The
       per-character guard above does not cover these additions, so check
       them explicitly instead of letting the signed sum overflow.  The
       right-hand side cannot itself overflow because osize >= 0. */
    if (escaped_quotes > PY_SSIZE_T_MAX - 2 - osize) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += escaped_quotes + 2;

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output kind to the matching repr writer. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12735
12736
/*[clinic input]
12737
@permit_long_summary
12738
str.rfind as unicode_rfind = str.count
12739
12740
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12741
12742
Optional arguments start and end are interpreted as in slice notation.
12743
Return -1 on failure.
12744
[clinic start generated code]*/
12745
12746
static Py_ssize_t
12747
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12748
                   Py_ssize_t end)
12749
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12750
242k
{
12751
242k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12752
242k
    if (result < 0) {
12753
8.92k
        return -1;
12754
8.92k
    }
12755
233k
    return result;
12756
242k
}
12757
12758
/*[clinic input]
12759
@permit_long_summary
12760
str.rindex as unicode_rindex = str.count
12761
12762
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12763
12764
Optional arguments start and end are interpreted as in slice notation.
12765
Raises ValueError when the substring is not found.
12766
[clinic start generated code]*/
12767
12768
static Py_ssize_t
12769
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12770
                    Py_ssize_t end)
12771
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12772
106k
{
12773
106k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12774
106k
    if (result == -1) {
12775
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12776
0
    }
12777
106k
    else if (result < 0) {
12778
0
        return -1;
12779
0
    }
12780
106k
    return result;
12781
106k
}
12782
12783
/*[clinic input]
12784
str.rjust as unicode_rjust
12785
12786
    width: Py_ssize_t
12787
    fillchar: Py_UCS4 = ' '
12788
    /
12789
12790
Return a right-justified string of length width.
12791
12792
Padding is done using the specified fill character (default is a space).
12793
[clinic start generated code]*/
12794
12795
static PyObject *
12796
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12797
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12798
0
{
12799
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12800
0
        return unicode_result_unchanged(self);
12801
12802
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12803
0
}
12804
12805
PyObject *
12806
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12807
0
{
12808
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12809
0
        return NULL;
12810
12811
0
    return split(s, sep, maxsplit);
12812
0
}
12813
12814
/*[clinic input]
12815
@permit_long_summary
12816
str.split as unicode_split
12817
12818
    sep: object = None
12819
        The separator used to split the string.
12820
12821
        When set to None (the default value), will split on any whitespace
12822
        character (including \n \r \t \f and spaces) and will discard
12823
        empty strings from the result.
12824
    maxsplit: Py_ssize_t = -1
12825
        Maximum number of splits.
12826
        -1 (the default value) means no limit.
12827
12828
Return a list of the substrings in the string, using sep as the separator string.
12829
12830
Splitting starts at the front of the string and works to the end.
12831
12832
Note, str.split() is mainly useful for data that has been intentionally
12833
delimited.  With natural text that includes punctuation, consider using
12834
the regular expression module.
12835
12836
[clinic start generated code]*/
12837
12838
static PyObject *
12839
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12840
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12841
20.4M
{
12842
20.4M
    if (sep == Py_None)
12843
173k
        return split(self, NULL, maxsplit);
12844
20.2M
    if (PyUnicode_Check(sep))
12845
20.2M
        return split(self, sep, maxsplit);
12846
12847
0
    PyErr_Format(PyExc_TypeError,
12848
0
                 "must be str or None, not %.100s",
12849
0
                 Py_TYPE(sep)->tp_name);
12850
0
    return NULL;
12851
20.2M
}
12852
12853
/* Public C API behind str.partition().

   Both arguments are validated with ensure_unicode().  When the separator
   has a wider kind or is longer than the string, (str_obj, "", "") is
   returned straight away.  Otherwise the separator buffer is converted to
   the string's kind if the kinds differ, the kind-specific partition
   routine is called, and the temporary buffer is released afterwards. */
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);
    /* True when buf2 is replaced by a temporary copy that must be freed. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    if (kind1 == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
    }
    else if (kind1 == PyUnicode_2BYTE_KIND) {
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else if (kind1 == PyUnicode_4BYTE_KIND) {
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12903
12904
12905
/* Public C API behind str.rpartition().

   Both arguments are validated with ensure_unicode().  When the separator
   has a wider kind or is longer than the string, ("", "", str_obj) is
   returned straight away.  Otherwise the separator buffer is converted to
   the string's kind if the kinds differ, the kind-specific rpartition
   routine is called, and the temporary buffer is released afterwards. */
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);
    /* True when buf2 is replaced by a temporary copy that must be freed. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    if (kind1 == PyUnicode_1BYTE_KIND) {
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
    }
    else if (kind1 == PyUnicode_2BYTE_KIND) {
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else if (kind1 == PyUnicode_4BYTE_KIND) {
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
    }
    else {
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12955
12956
/*[clinic input]
12957
@permit_long_docstring_body
12958
str.partition as unicode_partition
12959
12960
    sep: object
12961
    /
12962
12963
Partition the string into three parts using the given separator.
12964
12965
This will search for the separator in the string.  If the separator is found,
12966
returns a 3-tuple containing the part before the separator, the separator
12967
itself, and the part after it.
12968
12969
If the separator is not found, returns a 3-tuple containing the original string
12970
and two empty strings.
12971
[clinic start generated code]*/
12972
12973
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* Method-level entry point for str.partition(): the whole operation is
       forwarded to the C API function and its result (a new object, or
       NULL on error) is handed back unchanged. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12979
12980
/*[clinic input]
12981
@permit_long_docstring_body
12982
str.rpartition as unicode_rpartition = str.partition
12983
12984
Partition the string into three parts using the given separator.
12985
12986
This will search for the separator in the string, starting at the end. If
12987
the separator is found, returns a 3-tuple containing the part before the
12988
separator, the separator itself, and the part after it.
12989
12990
If the separator is not found, returns a 3-tuple containing two empty strings
12991
and the original string.
12992
[clinic start generated code]*/
12993
12994
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* Method-level entry point for str.rpartition(): the whole operation is
       forwarded to PyUnicode_RPartition() and its result (a new object, or
       NULL on error) is handed back unchanged. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13000
13001
PyObject *
13002
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13003
0
{
13004
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13005
0
        return NULL;
13006
13007
0
    return rsplit(s, sep, maxsplit);
13008
0
}
13009
13010
/*[clinic input]
13011
@permit_long_summary
13012
str.rsplit as unicode_rsplit = str.split
13013
13014
Return a list of the substrings in the string, using sep as the separator string.
13015
13016
Splitting starts at the end of the string and works to the front.
13017
[clinic start generated code]*/
13018
13019
static PyObject *
13020
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13021
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13022
78
{
13023
78
    if (sep == Py_None)
13024
0
        return rsplit(self, NULL, maxsplit);
13025
78
    if (PyUnicode_Check(sep))
13026
78
        return rsplit(self, sep, maxsplit);
13027
13028
0
    PyErr_Format(PyExc_TypeError,
13029
0
                 "must be str or None, not %.100s",
13030
0
                 Py_TYPE(sep)->tp_name);
13031
0
    return NULL;
13032
78
}
13033
13034
/*[clinic input]
13035
@permit_long_docstring_body
13036
str.splitlines as unicode_splitlines
13037
13038
    keepends: bool = False
13039
13040
Return a list of the lines in the string, breaking at line boundaries.
13041
13042
Line breaks are not included in the resulting list unless keepends is given and
13043
true.
13044
[clinic start generated code]*/
13045
13046
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(): forward to the C API function; `keepends` is
       passed through untouched. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13052
13053
static
13054
PyObject *unicode_str(PyObject *self)
13055
2.77M
{
13056
2.77M
    return unicode_result_unchanged(self);
13057
2.77M
}
13058
13059
/*[clinic input]
13060
@permit_long_summary
13061
str.swapcase as unicode_swapcase
13062
13063
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13064
[clinic start generated code]*/
13065
13066
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the shared case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13072
13073
static int
13074
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13075
0
{
13076
0
    PyObject *key, *value;
13077
0
    Py_ssize_t i = 0;
13078
0
    int res;
13079
0
    while (PyDict_Next(x, &i, &key, &value)) {
13080
0
        if (PyUnicode_Check(key)) {
13081
0
            PyObject *newkey;
13082
0
            int kind;
13083
0
            const void *data;
13084
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13085
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13086
0
                                "table must be of length 1");
13087
0
                return -1;
13088
0
            }
13089
0
            kind = PyUnicode_KIND(key);
13090
0
            data = PyUnicode_DATA(key);
13091
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13092
0
            if (!newkey)
13093
0
                return -1;
13094
0
            res = PyDict_SetItem(newdict, newkey, value);
13095
0
            Py_DECREF(newkey);
13096
0
            if (res < 0)
13097
0
                return -1;
13098
0
        }
13099
0
        else if (PyLong_Check(key)) {
13100
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13101
0
                return -1;
13102
0
        }
13103
0
        else {
13104
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13105
0
                            "be strings or integers");
13106
0
            return -1;
13107
0
        }
13108
0
    }
13109
0
    return 0;
13110
0
}
13111
13112
/*[clinic input]
13113
13114
@staticmethod
13115
str.maketrans as unicode_maketrans
13116
13117
  x: object
13118
13119
  y: unicode=NULL
13120
13121
  z: unicode=NULL
13122
13123
  /
13124
13125
Return a translation table usable for str.translate().
13126
13127
If there is only one argument, it must be a dictionary mapping Unicode
13128
ordinals (integers) or characters to Unicode ordinals, strings or None.
13129
Character keys will be then converted to ordinals.
13130
If there are two arguments, they must be strings of equal length, and
13131
in the resulting dictionary, each character in x will be mapped to the
13132
character at the same position in y. If there is a third argument, it
13133
must be a string, whose characters will be mapped to None in the result.
13134
[clinic start generated code]*/
13135
13136
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build the translation table as a fresh dict.

       One-argument form (y == NULL): x must be a dict; its items are
       copied by unicode_maketrans_from_dict().

       Two/three-argument form: x and y are equal-length strings and each
       character of x is mapped to the character of y at the same index;
       every character of the optional z is mapped to None.

       Returns the new dict, or NULL on error (the partially built dict is
       released). */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* x must be a dict */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto fail;
        }
        /* copy entries into the new dict, converting string keys to int
           keys; the status is kept in a local so that the error jump
           happens only after the critical section has been closed */
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto fail;
        }
        return table;
    }

    /* x must be a string too, of equal length */
    if (!PyUnicode_Check(x)) {
        PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                        "be a string if there is a second argument");
        goto fail;
    }
    const Py_ssize_t pair_count = PyUnicode_GET_LENGTH(x);
    if (pair_count != PyUnicode_GET_LENGTH(y)) {
        PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                        "arguments must have equal length");
        goto fail;
    }

    /* create entries for translating chars in x to those in y */
    const int from_kind = PyUnicode_KIND(x);
    const int to_kind = PyUnicode_KIND(y);
    const void *from_data = PyUnicode_DATA(x);
    const void *to_data = PyUnicode_DATA(y);
    for (Py_ssize_t pos = 0; pos < pair_count; pos++) {
        PyObject *ordinal = PyLong_FromLong(
            PyUnicode_READ(from_kind, from_data, pos));
        if (ordinal == NULL) {
            goto fail;
        }
        PyObject *target = PyLong_FromLong(
            PyUnicode_READ(to_kind, to_data, pos));
        if (target == NULL) {
            Py_DECREF(ordinal);
            goto fail;
        }
        int status = PyDict_SetItem(table, ordinal, target);
        Py_DECREF(ordinal);
        Py_DECREF(target);
        if (status < 0) {
            goto fail;
        }
    }

    /* create entries for deleting chars in z */
    if (z != NULL) {
        const int del_kind = PyUnicode_KIND(z);
        const void *del_data = PyUnicode_DATA(z);
        const Py_ssize_t del_count = PyUnicode_GET_LENGTH(z);
        for (Py_ssize_t pos = 0; pos < del_count; pos++) {
            PyObject *ordinal = PyLong_FromLong(
                PyUnicode_READ(del_kind, del_data, pos));
            if (ordinal == NULL) {
                goto fail;
            }
            int status = PyDict_SetItem(table, ordinal, Py_None);
            Py_DECREF(ordinal);
            if (status < 0) {
                goto fail;
            }
        }
    }
    return table;

  fail:
    Py_DECREF(table);
    return NULL;
}
13216
13217
/*[clinic input]
13218
@permit_long_docstring_body
13219
str.translate as unicode_translate
13220
13221
    table: object
13222
        Translation table, which must be a mapping of Unicode ordinals to
13223
        Unicode ordinals, strings, or None.
13224
    /
13225
13226
Replace each character in the string using the given translation table.
13227
13228
The table must implement lookup/indexing via __getitem__, for instance a
13229
dictionary or list.  If this operation raises LookupError, the character is
13230
left untouched.  Characters mapped to None are deleted.
13231
[clinic start generated code]*/
13232
13233
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(): delegate to the charmap translator, always with
       the "ignore" error mode. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13239
13240
/*[clinic input]
13241
str.upper as unicode_upper
13242
13243
Return a copy of the string converted to uppercase.
13244
[clinic start generated code]*/
13245
13246
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through the generic case_operation() driver
       with the do_upper callback. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    /* ASCII strings take the dedicated helper; the 0 flag presumably
       selects upper-casing -- confirm against ascii_upper_or_lower(). */
    return ascii_upper_or_lower(self, 0);
}
13254
13255
/*[clinic input]
13256
@permit_long_summary
13257
str.zfill as unicode_zfill
13258
13259
    width: Py_ssize_t
13260
    /
13261
13262
Pad a numeric string with zeros on the left, to fill a field of the given width.
13263
13264
The string is never truncated.
13265
[clinic start generated code]*/
13266
13267
static PyObject *
13268
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13269
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13270
0
{
13271
0
    Py_ssize_t fill;
13272
0
    PyObject *u;
13273
0
    int kind;
13274
0
    const void *data;
13275
0
    Py_UCS4 chr;
13276
13277
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13278
0
        return unicode_result_unchanged(self);
13279
13280
0
    fill = width - PyUnicode_GET_LENGTH(self);
13281
13282
0
    u = pad(self, fill, 0, '0');
13283
13284
0
    if (u == NULL)
13285
0
        return NULL;
13286
13287
0
    kind = PyUnicode_KIND(u);
13288
0
    data = PyUnicode_DATA(u);
13289
0
    chr = PyUnicode_READ(kind, data, fill);
13290
13291
0
    if (chr == '+' || chr == '-') {
13292
        /* move sign to beginning of string */
13293
0
        PyUnicode_WRITE(kind, data, 0, chr);
13294
0
        PyUnicode_WRITE(kind, data, fill, '0');
13295
0
    }
13296
13297
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13298
0
    return u;
13299
0
}
13300
13301
/*[clinic input]
13302
@permit_long_summary
13303
@text_signature "($self, prefix[, start[, end]], /)"
13304
str.startswith as unicode_startswith
13305
13306
    prefix as subobj: object
13307
        A string or a tuple of strings to try.
13308
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13309
        Optional start position. Default: start of the string.
13310
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13311
        Optional stop position. Default: end of the string.
13312
    /
13313
13314
Return True if the string starts with the specified prefix, False otherwise.
13315
[clinic start generated code]*/
13316
13317
static PyObject *
13318
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13319
                        Py_ssize_t end)
13320
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13321
34.0M
{
13322
34.0M
    if (PyTuple_Check(subobj)) {
13323
1.44M
        Py_ssize_t i;
13324
5.22M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13325
3.79M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13326
3.79M
            if (!PyUnicode_Check(substring)) {
13327
0
                PyErr_Format(PyExc_TypeError,
13328
0
                             "tuple for startswith must only contain str, "
13329
0
                             "not %.100s",
13330
0
                             Py_TYPE(substring)->tp_name);
13331
0
                return NULL;
13332
0
            }
13333
3.79M
            int result = tailmatch(self, substring, start, end, -1);
13334
3.79M
            if (result < 0) {
13335
0
                return NULL;
13336
0
            }
13337
3.79M
            if (result) {
13338
15.0k
                Py_RETURN_TRUE;
13339
15.0k
            }
13340
3.79M
        }
13341
        /* nothing matched */
13342
1.44M
        Py_RETURN_FALSE;
13343
1.44M
    }
13344
32.6M
    if (!PyUnicode_Check(subobj)) {
13345
0
        PyErr_Format(PyExc_TypeError,
13346
0
                     "startswith first arg must be str or "
13347
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13348
0
        return NULL;
13349
0
    }
13350
32.6M
    int result = tailmatch(self, subobj, start, end, -1);
13351
32.6M
    if (result < 0) {
13352
0
        return NULL;
13353
0
    }
13354
32.6M
    return PyBool_FromLong(result);
13355
32.6M
}
13356
13357
13358
/*[clinic input]
13359
@permit_long_summary
13360
@text_signature "($self, suffix[, start[, end]], /)"
13361
str.endswith as unicode_endswith
13362
13363
    suffix as subobj: object
13364
        A string or a tuple of strings to try.
13365
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13366
        Optional start position. Default: start of the string.
13367
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13368
        Optional stop position. Default: end of the string.
13369
    /
13370
13371
Return True if the string ends with the specified suffix, False otherwise.
13372
[clinic start generated code]*/
13373
13374
static PyObject *
13375
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13376
                      Py_ssize_t end)
13377
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13378
9.41M
{
13379
9.41M
    if (PyTuple_Check(subobj)) {
13380
179k
        Py_ssize_t i;
13381
341k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13382
316k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13383
316k
            if (!PyUnicode_Check(substring)) {
13384
0
                PyErr_Format(PyExc_TypeError,
13385
0
                             "tuple for endswith must only contain str, "
13386
0
                             "not %.100s",
13387
0
                             Py_TYPE(substring)->tp_name);
13388
0
                return NULL;
13389
0
            }
13390
316k
            int result = tailmatch(self, substring, start, end, +1);
13391
316k
            if (result < 0) {
13392
0
                return NULL;
13393
0
            }
13394
316k
            if (result) {
13395
154k
                Py_RETURN_TRUE;
13396
154k
            }
13397
316k
        }
13398
179k
        Py_RETURN_FALSE;
13399
179k
    }
13400
9.23M
    if (!PyUnicode_Check(subobj)) {
13401
0
        PyErr_Format(PyExc_TypeError,
13402
0
                     "endswith first arg must be str or "
13403
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13404
0
        return NULL;
13405
0
    }
13406
9.23M
    int result = tailmatch(self, subobj, start, end, +1);
13407
9.23M
    if (result < 0) {
13408
0
        return NULL;
13409
0
    }
13410
9.23M
    return PyBool_FromLong(result);
13411
9.23M
}
13412
13413
13414
#include "stringlib/unicode_format.h"
13415
13416
/* Hand-written docstrings for the "format" and "format_map" entries of
   unicode_methods.  The first line of each string is a signature line and
   is separated from the description by the "--" marker line. */
PyDoc_STRVAR(format__doc__,
13417
             "format($self, /, *args, **kwargs)\n\
13418
--\n\
13419
\n\
13420
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13421
The substitutions are identified by braces ('{' and '}').");
13422
13423
PyDoc_STRVAR(format_map__doc__,
13424
             "format_map($self, mapping, /)\n\
13425
--\n\
13426
\n\
13427
Return a formatted version of the string, using substitutions from mapping.\n\
13428
The substitutions are identified by braces ('{' and '}').");
13429
13430
/*[clinic input]
13431
str.__format__ as unicode___format__
13432
13433
    format_spec: unicode
13434
    /
13435
13436
Return a formatted version of the string as described by format_spec.
13437
[clinic start generated code]*/
13438
13439
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format `self` into a _PyUnicodeWriter using the whole of
       `format_spec` (indices 0 .. its length), then turn the writer into
       the result string.  On a formatting failure the writer is released
       and NULL is returned. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec, 0,
                                                 spec_end);
    if (status != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
13456
13457
/*[clinic input]
13458
str.__sizeof__ as unicode_sizeof
13459
13460
Return the size of the string in memory, in bytes.
13461
[clinic start generated code]*/
13462
13463
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* Number of character slots counted for the data block: the string
       length plus one extra slot (presumably the trailing NUL). */
    const Py_ssize_t slots = PyUnicode_GET_LENGTH(self) + 1;
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: base structure + one byte per slot. */
        total = sizeof(PyASCIIObject) + slots;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Other compact objects: base structure + slots scaled by the
           kind (the per-character width). */
        total = sizeof(PyCompactUnicodeObject) +
            slots * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base object, plus the character block
           only when one is present. */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += slots * PyUnicode_KIND(self);
        }
    }

    /* A separately held UTF-8 buffer is counted on top of that. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
13491
13492
/* __getnewargs__: return a 1-tuple holding a copy of `v` made with
   _PyUnicode_Copy().  Returns NULL if the copy could not be made. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate != NULL) {
        /* "N" hands our reference to `duplicate` over to the tuple, so no
           DECREF is needed here. */
        return Py_BuildValue("(N)", duplicate);
    }
    return NULL;
}
13500
13501
/*
Search for the longest leading run of spaces/tabs shared by all lines in
[src, end).

Lines are separated by '\n'.  Lines that are empty or consist only of
spaces and tabs are ignored.  A line whose first character is neither a
space nor a tab makes the result 0 immediately.

Returns the length of the common leading whitespace.  When that length is
greater than 0, `*output` is set to point at the beginning of one
occurrence of it (the start of the first non-blank line); when the result
is 0, `*output` is left untouched.
*/
static Py_ssize_t
search_longest_common_leading_whitespace(
    const char *const src,
    const char *const end,
    const char **output)
{
    // [common, common + common_len) is the shared indent found so far;
    // common stays NULL until the first non-blank line has been seen.
    const char *common = NULL;
    Py_ssize_t common_len = 0;

    const char *cursor = src;
    while (cursor < end) {
        const char *const line = cursor;

        // Phase 1: consume the run of spaces and tabs opening the line.
        while (cursor < end && (*cursor == ' ' || *cursor == '\t')) {
            ++cursor;
        }
        const char *const indent_end = cursor;

        // Phase 2: move on to the end of the line.
        while (cursor < end && *cursor != '\n') {
            ++cursor;
        }

        // The indent ran into the end of the line: nothing but whitespace
        // here (or nothing at all), so the line does not take part.
        const int blank = (indent_end == cursor);

        // Step over the '\n' so that the next round starts on a new line.
        if (cursor < end) {
            ++cursor;
        }
        if (blank) {
            continue;
        }

        const Py_ssize_t indent_len = indent_end - line;
        if (indent_len == 0) {
            // some line has no indent, fast exit!
            return 0;
        }

        if (common == NULL) {
            // First non-blank line: its indent is the initial candidate.
            common = line;
            common_len = indent_len;
            assert(common_len > 0);
            continue;
        }

        // Shrink the candidate to the prefix it shares with this line.
        const Py_ssize_t limit =
            (indent_len < common_len) ? indent_len : common_len;
        Py_ssize_t shared = 0;
        while (shared < limit && common[shared] == line[shared]) {
            ++shared;
        }
        common_len = shared;
        if (common_len == 0) {
            // No common things now, fast exit!
            return 0;
        }
    }

    assert(common_len >= 0);
    if (common_len > 0) {
        *output = common;
    }
    return common_len;
}
13580
13581
/* Dedent a string.
13582
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13583
   only supports spaces and tabs and doesn't normalize empty lines.
13584
   Return a new reference on success, NULL with exception set on error.
13585
   */
13586
PyObject *
13587
_PyUnicode_Dedent(PyObject *unicode)
13588
0
{
13589
0
    Py_ssize_t src_len = 0;
13590
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13591
0
    if (!src) {
13592
0
        return NULL;
13593
0
    }
13594
0
    assert(src_len >= 0);
13595
0
    if (src_len == 0) {
13596
0
        return Py_NewRef(unicode);
13597
0
    }
13598
13599
0
    const char *const end = src + src_len;
13600
13601
    // [whitespace_start, whitespace_start + whitespace_len)
13602
    // describes the current longest common leading whitespace
13603
0
    const char *whitespace_start = NULL;
13604
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13605
0
        src, end, &whitespace_start);
13606
13607
0
    if (whitespace_len == 0) {
13608
0
        return Py_NewRef(unicode);
13609
0
    }
13610
13611
    // now we should trigger a dedent
13612
0
    char *dest = PyMem_Malloc(src_len);
13613
0
    if (!dest) {
13614
0
        PyErr_NoMemory();
13615
0
        return NULL;
13616
0
    }
13617
0
    char *dest_iter = dest;
13618
13619
0
    for (const char *iter = src; iter < end; ++iter) {
13620
0
        const char *line_start = iter;
13621
0
        bool in_leading_space = true;
13622
13623
        // iterate over a line to find the end of a line
13624
0
        while (iter < end && *iter != '\n') {
13625
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13626
0
                in_leading_space = false;
13627
0
            }
13628
0
            ++iter;
13629
0
        }
13630
13631
        // invariant: *iter == '\n' or iter == end
13632
0
        bool append_newline = iter < end;
13633
13634
        // if this line has all white space, write '\n' and continue
13635
0
        if (in_leading_space && append_newline) {
13636
0
            *dest_iter++ = '\n';
13637
0
            continue;
13638
0
        }
13639
13640
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13641
            conditionally append '\n' */
13642
13643
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13644
0
        assert(new_line_len >= 0);
13645
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13646
13647
0
        dest_iter += new_line_len;
13648
13649
0
        if (append_newline) {
13650
0
            *dest_iter++ = '\n';
13651
0
        }
13652
0
    }
13653
13654
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13655
0
    PyMem_Free(dest);
13656
0
    return res;
13657
0
}
13658
13659
/* Method table for the string type.  Most entries come from the
   UNICODE_*_METHODDEF macros (one per Argument Clinic declared method);
   "format", "format_map" and "__getnewargs__" are written out by hand.
   The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {
13660
    UNICODE_ENCODE_METHODDEF
13661
    UNICODE_REPLACE_METHODDEF
13662
    UNICODE_SPLIT_METHODDEF
13663
    UNICODE_RSPLIT_METHODDEF
13664
    UNICODE_JOIN_METHODDEF
13665
    UNICODE_CAPITALIZE_METHODDEF
13666
    UNICODE_CASEFOLD_METHODDEF
13667
    UNICODE_TITLE_METHODDEF
13668
    UNICODE_CENTER_METHODDEF
13669
    UNICODE_COUNT_METHODDEF
13670
    UNICODE_EXPANDTABS_METHODDEF
13671
    UNICODE_FIND_METHODDEF
13672
    UNICODE_PARTITION_METHODDEF
13673
    UNICODE_INDEX_METHODDEF
13674
    UNICODE_LJUST_METHODDEF
13675
    UNICODE_LOWER_METHODDEF
13676
    UNICODE_LSTRIP_METHODDEF
13677
    UNICODE_RFIND_METHODDEF
13678
    UNICODE_RINDEX_METHODDEF
13679
    UNICODE_RJUST_METHODDEF
13680
    UNICODE_RSTRIP_METHODDEF
13681
    UNICODE_RPARTITION_METHODDEF
13682
    UNICODE_SPLITLINES_METHODDEF
13683
    UNICODE_STRIP_METHODDEF
13684
    UNICODE_SWAPCASE_METHODDEF
13685
    UNICODE_TRANSLATE_METHODDEF
13686
    UNICODE_UPPER_METHODDEF
13687
    UNICODE_STARTSWITH_METHODDEF
13688
    UNICODE_ENDSWITH_METHODDEF
13689
    UNICODE_REMOVEPREFIX_METHODDEF
13690
    UNICODE_REMOVESUFFIX_METHODDEF
13691
    UNICODE_ISASCII_METHODDEF
13692
    UNICODE_ISLOWER_METHODDEF
13693
    UNICODE_ISUPPER_METHODDEF
13694
    UNICODE_ISTITLE_METHODDEF
13695
    UNICODE_ISSPACE_METHODDEF
13696
    UNICODE_ISDECIMAL_METHODDEF
13697
    UNICODE_ISDIGIT_METHODDEF
13698
    UNICODE_ISNUMERIC_METHODDEF
13699
    UNICODE_ISALPHA_METHODDEF
13700
    UNICODE_ISALNUM_METHODDEF
13701
    UNICODE_ISIDENTIFIER_METHODDEF
13702
    UNICODE_ISPRINTABLE_METHODDEF
13703
    UNICODE_ZFILL_METHODDEF
13704
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13705
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13706
    UNICODE___FORMAT___METHODDEF
13707
    UNICODE_MAKETRANS_METHODDEF
13708
    UNICODE_SIZEOF_METHODDEF
13709
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13710
    {NULL, NULL}
13711
};
13712
13713
/* nb_remainder slot: `v % w`.  Formatting is only attempted when the left
   operand is a string; otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13720
13721
static PyNumberMethods unicode_as_number = {
13722
    0,              /*nb_add*/
13723
    0,              /*nb_subtract*/
13724
    0,              /*nb_multiply*/
13725
    unicode_mod,            /*nb_remainder*/
13726
};
13727
13728
static PySequenceMethods unicode_as_sequence = {
13729
    unicode_length,     /* sq_length */
13730
    PyUnicode_Concat,   /* sq_concat */
13731
    unicode_repeat,     /* sq_repeat */
13732
    unicode_getitem,    /* sq_item */
13733
    0,                  /* sq_slice */
13734
    0,                  /* sq_ass_item */
13735
    0,                  /* sq_ass_slice */
13736
    PyUnicode_Contains, /* sq_contains */
13737
};
13738
13739
/* mp_subscript slot: implements self[item].

   - An index-like `item` is converted to Py_ssize_t, a negative value is
     offset by the string length, and unicode_getitem() does the rest.
   - A slice yields the empty string when it selects nothing, goes through
     unicode_result_unchanged() when it covers the whole string with step
     1, uses PyUnicode_Substring() for any other step-1 slice, and is
     copied character by character for every other step.
   - Anything else raises TypeError.

   Returns the result object, or NULL on error. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t slicelength = PySlice_AdjustIndices(
        PyUnicode_GET_LENGTH(self), &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: find the largest selected character so that the result
       can be allocated with a fitting maximum. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        /* The position is kept in a size_t, so adding a negative step
           relies on unsigned wrap-around. */
        size_t pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
            const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                /* Reaching the limit for this kind ends the scan early. */
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters. */
    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
        const Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13808
13809
static PyMappingMethods unicode_as_mapping = {
13810
    unicode_length,     /* mp_length */
13811
    unicode_subscript,  /* mp_subscript */
13812
    0,                  /* mp_ass_subscript */
13813
};
13814
13815
13816
static PyObject *
13817
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13818
13819
/*[clinic input]
13820
@classmethod
13821
str.__new__ as unicode_new
13822
13823
    object as x: object = NULL
13824
    encoding: str = NULL
13825
    errors: str = NULL
13826
13827
[clinic start generated code]*/
13828
13829
static PyObject *
13830
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13831
                 const char *errors)
13832
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13833
16.1M
{
13834
16.1M
    PyObject *unicode;
13835
16.1M
    if (x == NULL) {
13836
0
        unicode = _PyUnicode_GetEmpty();
13837
0
    }
13838
16.1M
    else if (encoding == NULL && errors == NULL) {
13839
16.1M
        unicode = PyObject_Str(x);
13840
16.1M
    }
13841
0
    else {
13842
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13843
0
    }
13844
13845
16.1M
    if (unicode != NULL && type != &PyUnicode_Type) {
13846
16.1M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13847
16.1M
    }
13848
16.1M
    return unicode;
13849
16.1M
}
13850
13851
static const char *
13852
arg_as_utf8(PyObject *obj, const char *name)
13853
646k
{
13854
646k
    if (!PyUnicode_Check(obj)) {
13855
0
        PyErr_Format(PyExc_TypeError,
13856
0
                     "str() argument '%s' must be str, not %T",
13857
0
                     name, obj);
13858
0
        return NULL;
13859
0
    }
13860
646k
    return _PyUnicode_AsUTF8NoNUL(obj);
13861
646k
}
13862
13863
static PyObject *
13864
unicode_vectorcall(PyObject *type, PyObject *const *args,
13865
                   size_t nargsf, PyObject *kwnames)
13866
416k
{
13867
416k
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
13868
13869
416k
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
13870
416k
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
13871
        // Fallback to unicode_new()
13872
0
        PyObject *tuple = PyTuple_FromArray(args, nargs);
13873
0
        if (tuple == NULL) {
13874
0
            return NULL;
13875
0
        }
13876
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
13877
0
        if (dict == NULL) {
13878
0
            Py_DECREF(tuple);
13879
0
            return NULL;
13880
0
        }
13881
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
13882
0
        Py_DECREF(tuple);
13883
0
        Py_DECREF(dict);
13884
0
        return ret;
13885
0
    }
13886
416k
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
13887
0
        return NULL;
13888
0
    }
13889
416k
    if (nargs == 0) {
13890
3.86k
        return _PyUnicode_GetEmpty();
13891
3.86k
    }
13892
412k
    PyObject *object = args[0];
13893
412k
    if (nargs == 1) {
13894
1.24k
        return PyObject_Str(object);
13895
1.24k
    }
13896
411k
    const char *encoding = arg_as_utf8(args[1], "encoding");
13897
411k
    if (encoding == NULL) {
13898
0
        return NULL;
13899
0
    }
13900
411k
    const char *errors = NULL;
13901
411k
    if (nargs == 3) {
13902
235k
        errors = arg_as_utf8(args[2], "errors");
13903
235k
        if (errors == NULL) {
13904
0
            return NULL;
13905
0
        }
13906
235k
    }
13907
411k
    return PyUnicode_FromEncodedObject(object, encoding, errors);
13908
411k
}
13909
13910
static PyObject *
13911
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13912
16.1M
{
13913
16.1M
    PyObject *self;
13914
16.1M
    Py_ssize_t length, char_size;
13915
16.1M
    int share_utf8;
13916
16.1M
    int kind;
13917
16.1M
    void *data;
13918
13919
16.1M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13920
16.1M
    assert(_PyUnicode_CHECK(unicode));
13921
13922
16.1M
    self = type->tp_alloc(type, 0);
13923
16.1M
    if (self == NULL) {
13924
0
        return NULL;
13925
0
    }
13926
16.1M
    kind = PyUnicode_KIND(unicode);
13927
16.1M
    length = PyUnicode_GET_LENGTH(unicode);
13928
13929
16.1M
    _PyUnicode_LENGTH(self) = length;
13930
#ifdef Py_DEBUG
13931
    _PyUnicode_HASH(self) = -1;
13932
#else
13933
16.1M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13934
16.1M
#endif
13935
16.1M
    _PyUnicode_STATE(self).interned = 0;
13936
16.1M
    _PyUnicode_STATE(self).kind = kind;
13937
16.1M
    _PyUnicode_STATE(self).compact = 0;
13938
16.1M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13939
16.1M
    _PyUnicode_STATE(self).statically_allocated = 0;
13940
16.1M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13941
16.1M
    PyUnicode_SET_UTF8(self, NULL);
13942
16.1M
    _PyUnicode_DATA_ANY(self) = NULL;
13943
13944
16.1M
    share_utf8 = 0;
13945
16.1M
    if (kind == PyUnicode_1BYTE_KIND) {
13946
13.6M
        char_size = 1;
13947
13.6M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13948
13.5M
            share_utf8 = 1;
13949
13.6M
    }
13950
2.52M
    else if (kind == PyUnicode_2BYTE_KIND) {
13951
2.47M
        char_size = 2;
13952
2.47M
    }
13953
58.4k
    else {
13954
58.4k
        assert(kind == PyUnicode_4BYTE_KIND);
13955
58.4k
        char_size = 4;
13956
58.4k
    }
13957
13958
    /* Ensure we won't overflow the length. */
13959
16.1M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13960
0
        PyErr_NoMemory();
13961
0
        goto onError;
13962
0
    }
13963
16.1M
    data = PyMem_Malloc((length + 1) * char_size);
13964
16.1M
    if (data == NULL) {
13965
0
        PyErr_NoMemory();
13966
0
        goto onError;
13967
0
    }
13968
13969
16.1M
    _PyUnicode_DATA_ANY(self) = data;
13970
16.1M
    if (share_utf8) {
13971
13.5M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13972
13.5M
        PyUnicode_SET_UTF8(self, data);
13973
13.5M
    }
13974
13975
16.1M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13976
16.1M
    assert(_PyUnicode_CheckConsistency(self, 1));
13977
#ifdef Py_DEBUG
13978
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13979
#endif
13980
16.1M
    return self;
13981
13982
0
onError:
13983
0
    Py_DECREF(self);
13984
0
    return NULL;
13985
16.1M
}
13986
13987
void
13988
_PyUnicode_ExactDealloc(PyObject *op)
13989
62.8M
{
13990
62.8M
    assert(PyUnicode_CheckExact(op));
13991
62.8M
    unicode_dealloc(op);
13992
62.8M
}
13993
13994
PyDoc_STRVAR(unicode_doc,
13995
"str(object='') -> str\n\
13996
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
13997
\n\
13998
Create a new string object from the given object. If encoding or\n\
13999
errors is specified, then the object must expose a data buffer\n\
14000
that will be decoded using the given encoding and error handler.\n\
14001
Otherwise, returns the result of object.__str__() (if defined)\n\
14002
or repr(object).\n\
14003
encoding defaults to 'utf-8'.\n\
14004
errors defaults to 'strict'.");
14005
14006
/* Forward declaration: referenced by the tp_iter slot of PyUnicode_Type
   before the function is defined further down. */
static PyObject *unicode_iter(PyObject *seq);
14007
14008
PyTypeObject PyUnicode_Type = {
14009
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14010
    "str",                        /* tp_name */
14011
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14012
    0,                            /* tp_itemsize */
14013
    /* Slots */
14014
    unicode_dealloc,              /* tp_dealloc */
14015
    0,                            /* tp_vectorcall_offset */
14016
    0,                            /* tp_getattr */
14017
    0,                            /* tp_setattr */
14018
    0,                            /* tp_as_async */
14019
    unicode_repr,                 /* tp_repr */
14020
    &unicode_as_number,           /* tp_as_number */
14021
    &unicode_as_sequence,         /* tp_as_sequence */
14022
    &unicode_as_mapping,          /* tp_as_mapping */
14023
    unicode_hash,                 /* tp_hash*/
14024
    0,                            /* tp_call*/
14025
    unicode_str,                  /* tp_str */
14026
    PyObject_GenericGetAttr,      /* tp_getattro */
14027
    0,                            /* tp_setattro */
14028
    0,                            /* tp_as_buffer */
14029
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14030
        Py_TPFLAGS_UNICODE_SUBCLASS |
14031
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14032
    unicode_doc,                  /* tp_doc */
14033
    0,                            /* tp_traverse */
14034
    0,                            /* tp_clear */
14035
    PyUnicode_RichCompare,        /* tp_richcompare */
14036
    0,                            /* tp_weaklistoffset */
14037
    unicode_iter,                 /* tp_iter */
14038
    0,                            /* tp_iternext */
14039
    unicode_methods,              /* tp_methods */
14040
    0,                            /* tp_members */
14041
    0,                            /* tp_getset */
14042
    0,                            /* tp_base */
14043
    0,                            /* tp_dict */
14044
    0,                            /* tp_descr_get */
14045
    0,                            /* tp_descr_set */
14046
    0,                            /* tp_dictoffset */
14047
    0,                            /* tp_init */
14048
    0,                            /* tp_alloc */
14049
    unicode_new,                  /* tp_new */
14050
    PyObject_Free,                /* tp_free */
14051
    .tp_vectorcall = unicode_vectorcall,
14052
};
14053
14054
/* Initialize the Unicode implementation */
14055
14056
static void
14057
_init_global_state(void)
14058
36
{
14059
36
    static int initialized = 0;
14060
36
    if (initialized) {
14061
0
        return;
14062
0
    }
14063
36
    initialized = 1;
14064
14065
    /* initialize the linebreak bloom filter */
14066
36
    const Py_UCS2 linebreak[] = {
14067
36
        0x000A, /* LINE FEED */
14068
36
        0x000D, /* CARRIAGE RETURN */
14069
36
        0x001C, /* FILE SEPARATOR */
14070
36
        0x001D, /* GROUP SEPARATOR */
14071
36
        0x001E, /* RECORD SEPARATOR */
14072
36
        0x0085, /* NEXT LINE */
14073
36
        0x2028, /* LINE SEPARATOR */
14074
36
        0x2029, /* PARAGRAPH SEPARATOR */
14075
36
    };
14076
36
    bloom_linebreak = make_bloom_mask(
14077
36
        PyUnicode_2BYTE_KIND, linebreak,
14078
36
        Py_ARRAY_LENGTH(linebreak));
14079
36
}
14080
14081
void
14082
_PyUnicode_InitState(PyInterpreterState *interp)
14083
36
{
14084
36
    if (!_Py_IsMainInterpreter(interp)) {
14085
0
        return;
14086
0
    }
14087
36
    _init_global_state();
14088
36
}
14089
14090
14091
PyStatus
14092
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14093
36
{
14094
36
    if (_Py_IsMainInterpreter(interp)) {
14095
36
        PyStatus status = init_global_interned_strings(interp);
14096
36
        if (_PyStatus_EXCEPTION(status)) {
14097
0
            return status;
14098
0
        }
14099
36
    }
14100
36
    assert(INTERNED_STRINGS);
14101
14102
36
    if (init_interned_dict(interp)) {
14103
0
        PyErr_Clear();
14104
0
        return _PyStatus_ERR("failed to create interned dict");
14105
0
    }
14106
14107
36
    return _PyStatus_OK();
14108
36
}
14109
14110
14111
PyStatus
14112
_PyUnicode_InitTypes(PyInterpreterState *interp)
14113
36
{
14114
36
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14115
0
        goto error;
14116
0
    }
14117
36
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14118
0
        goto error;
14119
0
    }
14120
36
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14121
0
        goto error;
14122
0
    }
14123
36
    return _PyStatus_OK();
14124
14125
0
error:
14126
0
    return _PyStatus_ERR("Can't initialize unicode types");
14127
36
}
14128
14129
static /* non-null */ PyObject*
14130
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14131
40.1k
{
14132
    // Note that this steals a reference to `s`, but in many cases that
14133
    // stolen ref is returned, requiring no decref/incref.
14134
14135
40.1k
    assert(s != NULL);
14136
40.1k
    assert(_PyUnicode_CHECK(s));
14137
40.1k
    assert(_PyUnicode_STATE(s).statically_allocated);
14138
40.1k
    assert(!PyUnicode_CHECK_INTERNED(s));
14139
14140
#ifdef Py_DEBUG
14141
    /* We must not add process-global interned string if there's already a
14142
     * per-interpreter interned_dict, which might contain duplicates.
14143
     */
14144
    PyObject *interned = get_interned_dict(interp);
14145
    assert(interned == NULL);
14146
#endif
14147
14148
    /* Look in the global cache first. */
14149
40.1k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14150
    /* We should only init each string once */
14151
40.1k
    assert(r == NULL);
14152
    /* but just in case (for the non-debug build), handle this */
14153
40.1k
    if (r != NULL && r != s) {
14154
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14155
0
        assert(_PyUnicode_CHECK(r));
14156
0
        Py_DECREF(s);
14157
0
        return Py_NewRef(r);
14158
0
    }
14159
14160
40.1k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14161
0
        Py_FatalError("failed to intern static string");
14162
0
    }
14163
14164
40.1k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14165
40.1k
    return s;
14166
40.1k
}
14167
14168
void
14169
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14170
40.1k
{
14171
    // This should only be called as part of runtime initialization
14172
40.1k
    assert(!Py_IsInitialized());
14173
14174
40.1k
    *p = intern_static(interp, *p);
14175
40.1k
    assert(*p);
14176
40.1k
}
14177
14178
static void
14179
immortalize_interned(PyObject *s)
14180
292k
{
14181
292k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14182
292k
    assert(!_Py_IsImmortal(s));
14183
#ifdef Py_REF_DEBUG
14184
    /* The reference count value should be excluded from the RefTotal.
14185
       The decrements to these objects will not be registered so they
14186
       need to be accounted for in here. */
14187
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14188
        _Py_DecRefTotal(_PyThreadState_GET());
14189
    }
14190
#endif
14191
292k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14192
292k
    _Py_SetImmortal(s);
14193
292k
}
14194
14195
static /* non-null */ PyObject*
14196
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14197
              bool immortalize)
14198
90.6M
{
14199
    // Note that this steals a reference to `s`, but in many cases that
14200
    // stolen ref is returned, requiring no decref/incref.
14201
14202
#ifdef Py_DEBUG
14203
    assert(s != NULL);
14204
    assert(_PyUnicode_CHECK(s));
14205
#else
14206
90.6M
    if (s == NULL || !PyUnicode_Check(s)) {
14207
0
        return s;
14208
0
    }
14209
90.6M
#endif
14210
14211
    /* If it's a subclass, we don't really know what putting
14212
       it in the interned dict might do. */
14213
90.6M
    if (!PyUnicode_CheckExact(s)) {
14214
0
        return s;
14215
0
    }
14216
14217
    /* Is it already interned? */
14218
90.6M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14219
5.03M
        case SSTATE_NOT_INTERNED:
14220
            // no, go on
14221
5.03M
            break;
14222
32.0k
        case SSTATE_INTERNED_MORTAL:
14223
            // yes but we might need to make it immortal
14224
32.0k
            if (immortalize) {
14225
5.28k
                immortalize_interned(s);
14226
5.28k
            }
14227
32.0k
            return s;
14228
85.6M
        default:
14229
            // all done
14230
85.6M
            return s;
14231
90.6M
    }
14232
14233
    /* Statically allocated strings must be already interned. */
14234
90.6M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14235
14236
#if Py_GIL_DISABLED
14237
    /* In the free-threaded build, all interned strings are immortal */
14238
    immortalize = 1;
14239
#endif
14240
14241
    /* If it's already immortal, intern it as such */
14242
5.03M
    if (_Py_IsImmortal(s)) {
14243
0
        immortalize = 1;
14244
0
    }
14245
14246
    /* if it's a short string, get the singleton */
14247
5.03M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14248
16.5k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14249
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14250
0
        assert(PyUnicode_CHECK_INTERNED(r));
14251
0
        Py_DECREF(s);
14252
0
        return r;
14253
0
    }
14254
#ifdef Py_DEBUG
14255
    assert(!unicode_is_singleton(s));
14256
#endif
14257
14258
    /* Look in the global cache now. */
14259
5.03M
    {
14260
5.03M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14261
5.03M
        if (r != NULL) {
14262
526k
            assert(_PyUnicode_STATE(r).statically_allocated);
14263
526k
            assert(r != s);  // r must be statically_allocated; s is not
14264
526k
            Py_DECREF(s);
14265
526k
            return Py_NewRef(r);
14266
526k
        }
14267
5.03M
    }
14268
14269
    /* Do a setdefault on the per-interpreter cache. */
14270
4.50M
    PyObject *interned = get_interned_dict(interp);
14271
4.50M
    assert(interned != NULL);
14272
#ifdef Py_GIL_DISABLED
14273
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14274
#endif
14275
4.50M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14276
4.50M
    PyObject *t;
14277
4.50M
    {
14278
4.50M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14279
4.50M
        if (res < 0) {
14280
0
            PyErr_Clear();
14281
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14282
0
            return s;
14283
0
        }
14284
4.50M
        else if (res == 1) {
14285
            // value was already present (not inserted)
14286
3.69M
            Py_DECREF(s);
14287
3.69M
            if (immortalize &&
14288
1.02M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14289
10.1k
                immortalize_interned(t);
14290
10.1k
            }
14291
3.69M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14292
3.69M
            return t;
14293
3.69M
        }
14294
804k
        else {
14295
            // value was newly inserted
14296
804k
            assert (s == t);
14297
804k
            Py_DECREF(t);
14298
804k
        }
14299
4.50M
    }
14300
14301
    /* NOT_INTERNED -> INTERNED_MORTAL */
14302
14303
4.50M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14304
14305
804k
    if (!_Py_IsImmortal(s)) {
14306
        /* The two references in interned dict (key and value) are not counted.
14307
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14308
804k
        Py_DECREF(s);
14309
804k
        Py_DECREF(s);
14310
804k
    }
14311
804k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14312
14313
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14314
14315
#ifdef Py_DEBUG
14316
    if (_Py_IsImmortal(s)) {
14317
        assert(immortalize);
14318
    }
14319
#endif
14320
804k
    if (immortalize) {
14321
276k
        immortalize_interned(s);
14322
276k
    }
14323
14324
804k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14325
804k
    return s;
14326
4.50M
}
14327
14328
void
14329
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14330
14.0M
{
14331
14.0M
    *p = intern_common(interp, *p, 1);
14332
14.0M
    assert(*p);
14333
14.0M
}
14334
14335
void
14336
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14337
76.6M
{
14338
76.6M
    *p = intern_common(interp, *p, 0);
14339
76.6M
    assert(*p);
14340
76.6M
}
14341
14342
14343
void
14344
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14345
0
{
14346
0
    _PyUnicode_InternImmortal(interp, p);
14347
0
    return;
14348
0
}
14349
14350
void
14351
PyUnicode_InternInPlace(PyObject **p)
14352
0
{
14353
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14354
0
    _PyUnicode_InternMortal(interp, p);
14355
0
}
14356
14357
// Public-looking name kept for the stable ABI; user should not call this:
14358
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14359
void
14360
PyUnicode_InternImmortal(PyObject **p)
14361
0
{
14362
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14363
0
    _PyUnicode_InternImmortal(interp, p);
14364
0
}
14365
14366
PyObject *
14367
PyUnicode_InternFromString(const char *cp)
14368
972k
{
14369
972k
    PyObject *s = PyUnicode_FromString(cp);
14370
972k
    if (s == NULL) {
14371
0
        return NULL;
14372
0
    }
14373
972k
    PyInterpreterState *interp = _PyInterpreterState_GET();
14374
972k
    _PyUnicode_InternMortal(interp, &s);
14375
972k
    return s;
14376
972k
}
14377
14378
14379
void
14380
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14381
0
{
14382
0
    PyObject *interned = get_interned_dict(interp);
14383
0
    if (interned == NULL) {
14384
0
        return;
14385
0
    }
14386
0
    assert(PyDict_CheckExact(interned));
14387
14388
0
    if (has_shared_intern_dict(interp)) {
14389
        // the dict doesn't belong to this interpreter, skip the debug
14390
        // checks on it and just clear the pointer to it
14391
0
        clear_interned_dict(interp);
14392
0
        return;
14393
0
    }
14394
14395
#ifdef INTERNED_STATS
14396
    fprintf(stderr, "releasing %zd interned strings\n",
14397
            PyDict_GET_SIZE(interned));
14398
14399
    Py_ssize_t total_length = 0;
14400
#endif
14401
0
    Py_ssize_t pos = 0;
14402
0
    PyObject *s, *ignored_value;
14403
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14404
0
        int shared = 0;
14405
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14406
0
        case SSTATE_INTERNED_IMMORTAL:
14407
            /* Make immortal interned strings mortal again. */
14408
            // Skip the Immortal Instance check and restore
14409
            // the two references (key and value) ignored
14410
            // by PyUnicode_InternInPlace().
14411
0
            _Py_SetMortal(s, 2);
14412
#ifdef Py_REF_DEBUG
14413
            /* let's be pedantic with the ref total */
14414
            _Py_IncRefTotal(_PyThreadState_GET());
14415
            _Py_IncRefTotal(_PyThreadState_GET());
14416
#endif
14417
#ifdef INTERNED_STATS
14418
            total_length += PyUnicode_GET_LENGTH(s);
14419
#endif
14420
0
            break;
14421
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14422
            /* It is shared between interpreters, so we should unmark it
14423
               only when this is the last interpreter in which it's
14424
               interned.  We immortalize all the statically initialized
14425
               strings during startup, so we can rely on the
14426
               main interpreter to be the last one. */
14427
0
            if (!_Py_IsMainInterpreter(interp)) {
14428
0
                shared = 1;
14429
0
            }
14430
0
            break;
14431
0
        case SSTATE_INTERNED_MORTAL:
14432
            // Restore 2 references held by the interned dict; these will
14433
            // be decref'd by clear_interned_dict's PyDict_Clear.
14434
0
            _Py_RefcntAdd(s, 2);
14435
#ifdef Py_REF_DEBUG
14436
            /* let's be pedantic with the ref total */
14437
            _Py_IncRefTotal(_PyThreadState_GET());
14438
            _Py_IncRefTotal(_PyThreadState_GET());
14439
#endif
14440
0
            break;
14441
0
        case SSTATE_NOT_INTERNED:
14442
0
            _Py_FALLTHROUGH;
14443
0
        default:
14444
0
            Py_UNREACHABLE();
14445
0
        }
14446
0
        if (!shared) {
14447
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14448
0
        }
14449
0
    }
14450
#ifdef INTERNED_STATS
14451
    fprintf(stderr,
14452
            "total length of all interned strings: %zd characters\n",
14453
            total_length);
14454
#endif
14455
14456
0
    struct _Py_unicode_state *state = &interp->unicode;
14457
0
    struct _Py_unicode_ids *ids = &state->ids;
14458
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14459
0
        Py_XINCREF(ids->array[i]);
14460
0
    }
14461
0
    clear_interned_dict(interp);
14462
0
    if (_Py_IsMainInterpreter(interp)) {
14463
0
        clear_global_interned_strings();
14464
0
    }
14465
0
}
14466
14467
14468
/********************* Unicode Iterator **************************/
14469
14470
typedef struct {
14471
    PyObject_HEAD
14472
    Py_ssize_t it_index;
14473
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14474
} unicodeiterobject;
14475
14476
static void
14477
unicodeiter_dealloc(PyObject *op)
14478
869k
{
14479
869k
    unicodeiterobject *it = (unicodeiterobject *)op;
14480
869k
    _PyObject_GC_UNTRACK(it);
14481
869k
    Py_XDECREF(it->it_seq);
14482
869k
    PyObject_GC_Del(it);
14483
869k
}
14484
14485
static int
14486
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14487
1
{
14488
1
    unicodeiterobject *it = (unicodeiterobject *)op;
14489
1
    Py_VISIT(it->it_seq);
14490
1
    return 0;
14491
1
}
14492
14493
/* tp_iternext for str_iterator: return the next character as a string.
   At the end, release the string, mark the iterator exhausted and return
   NULL (no exception is set here). */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: drop the reference to the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
14517
14518
/* tp_iternext for str_ascii_iterator.  The string is asserted to be
   compact ASCII, so each character is looked up directly in the ASCII
   singleton table.  At the end, release the string and return NULL. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: drop the reference to the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    /* The character data is read from directly after the
       PyASCIIObject header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14540
14541
/* __length_hint__: number of characters left to yield, or 0 once the
   iterator is exhausted. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = 0;

    if (it->it_seq != NULL) {
        remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    }
    return PyLong_FromSsize_t(remaining);
}
14550
14551
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14552
14553
/* __reduce__: return (iter, (seq,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    /* _PyEval_GetBuiltin can invoke arbitrary code,
     * call must be before access of iterator pointers.
     * see issue #101765 */

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter, empty);
    }

    return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
}
14574
14575
/* Docstring for the iterator's __reduce__ method. */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14576
14577
/* __setstate__: restore the iteration index from the integer `state`.
   The index is clamped to [0, len(seq)]; an exhausted iterator is left
   untouched.  Returns None, or NULL if `state` cannot be converted. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14593
14594
/* Docstring for the iterator's __setstate__ method. */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14595
14596
static PyMethodDef unicodeiter_methods[] = {
14597
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14598
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14599
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14600
    {NULL,      NULL}       /* sentinel */
14601
};
14602
14603
PyTypeObject PyUnicodeIter_Type = {
14604
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14605
    "str_iterator",         /* tp_name */
14606
    sizeof(unicodeiterobject),      /* tp_basicsize */
14607
    0,                  /* tp_itemsize */
14608
    /* methods */
14609
    unicodeiter_dealloc,/* tp_dealloc */
14610
    0,                  /* tp_vectorcall_offset */
14611
    0,                  /* tp_getattr */
14612
    0,                  /* tp_setattr */
14613
    0,                  /* tp_as_async */
14614
    0,                  /* tp_repr */
14615
    0,                  /* tp_as_number */
14616
    0,                  /* tp_as_sequence */
14617
    0,                  /* tp_as_mapping */
14618
    0,                  /* tp_hash */
14619
    0,                  /* tp_call */
14620
    0,                  /* tp_str */
14621
    PyObject_GenericGetAttr,        /* tp_getattro */
14622
    0,                  /* tp_setattro */
14623
    0,                  /* tp_as_buffer */
14624
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14625
    0,                  /* tp_doc */
14626
    unicodeiter_traverse, /* tp_traverse */
14627
    0,                  /* tp_clear */
14628
    0,                  /* tp_richcompare */
14629
    0,                  /* tp_weaklistoffset */
14630
    PyObject_SelfIter,          /* tp_iter */
14631
    unicodeiter_next,   /* tp_iternext */
14632
    unicodeiter_methods,            /* tp_methods */
14633
    0,
14634
};
14635
14636
PyTypeObject _PyUnicodeASCIIIter_Type = {
14637
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14638
    .tp_name = "str_ascii_iterator",
14639
    .tp_basicsize = sizeof(unicodeiterobject),
14640
    .tp_dealloc = unicodeiter_dealloc,
14641
    .tp_getattro = PyObject_GenericGetAttr,
14642
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14643
    .tp_traverse = unicodeiter_traverse,
14644
    .tp_iter = PyObject_SelfIter,
14645
    .tp_iternext = unicode_ascii_iter_next,
14646
    .tp_methods = unicodeiter_methods,
14647
};
14648
14649
/* tp_iter for str: create an iterator over `seq`, using the ASCII
   iterator type when the string is compact ASCII and the general one
   otherwise.  Returns NULL on a non-str argument or allocation failure. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
        ? &_PyUnicodeASCIIIter_Type
        : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
14671
14672
static int
14673
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14674
144
{
14675
144
    int res;
14676
144
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14677
144
    if (res == -2) {
14678
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14679
0
        return -1;
14680
0
    }
14681
144
    if (res < 0) {
14682
0
        PyErr_NoMemory();
14683
0
        return -1;
14684
0
    }
14685
144
    return 0;
14686
144
}
14687
14688
14689
static int
14690
config_get_codec_name(wchar_t **config_encoding)
14691
72
{
14692
72
    char *encoding;
14693
72
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14694
0
        return -1;
14695
0
    }
14696
14697
72
    PyObject *name_obj = NULL;
14698
72
    PyObject *codec = _PyCodec_Lookup(encoding);
14699
72
    PyMem_RawFree(encoding);
14700
14701
72
    if (!codec)
14702
0
        goto error;
14703
14704
72
    name_obj = PyObject_GetAttrString(codec, "name");
14705
72
    Py_CLEAR(codec);
14706
72
    if (!name_obj) {
14707
0
        goto error;
14708
0
    }
14709
14710
72
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14711
72
    Py_DECREF(name_obj);
14712
72
    if (wname == NULL) {
14713
0
        goto error;
14714
0
    }
14715
14716
72
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14717
72
    if (raw_wname == NULL) {
14718
0
        PyMem_Free(wname);
14719
0
        PyErr_NoMemory();
14720
0
        goto error;
14721
0
    }
14722
14723
72
    PyMem_RawFree(*config_encoding);
14724
72
    *config_encoding = raw_wname;
14725
14726
72
    PyMem_Free(wname);
14727
72
    return 0;
14728
14729
0
error:
14730
0
    Py_XDECREF(codec);
14731
0
    Py_XDECREF(name_obj);
14732
0
    return -1;
14733
72
}
14734
14735
14736
static PyStatus
14737
init_stdio_encoding(PyInterpreterState *interp)
14738
36
{
14739
    /* Update the stdio encoding to the normalized Python codec name. */
14740
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14741
36
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14742
0
        return _PyStatus_ERR("failed to get the Python codec name "
14743
0
                             "of the stdio encoding");
14744
0
    }
14745
36
    return _PyStatus_OK();
14746
36
}
14747
14748
14749
static int
14750
init_fs_codec(PyInterpreterState *interp)
14751
36
{
14752
36
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14753
14754
36
    _Py_error_handler error_handler;
14755
36
    error_handler = get_error_handler_wide(config->filesystem_errors);
14756
36
    if (error_handler == _Py_ERROR_UNKNOWN) {
14757
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14758
0
        return -1;
14759
0
    }
14760
14761
36
    char *encoding, *errors;
14762
36
    if (encode_wstr_utf8(config->filesystem_encoding,
14763
36
                         &encoding,
14764
36
                         "filesystem_encoding") < 0) {
14765
0
        return -1;
14766
0
    }
14767
14768
36
    if (encode_wstr_utf8(config->filesystem_errors,
14769
36
                         &errors,
14770
36
                         "filesystem_errors") < 0) {
14771
0
        PyMem_RawFree(encoding);
14772
0
        return -1;
14773
0
    }
14774
14775
36
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14776
36
    PyMem_RawFree(fs_codec->encoding);
14777
36
    fs_codec->encoding = encoding;
14778
    /* encoding has been normalized by init_fs_encoding() */
14779
36
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14780
36
    PyMem_RawFree(fs_codec->errors);
14781
36
    fs_codec->errors = errors;
14782
36
    fs_codec->error_handler = error_handler;
14783
14784
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14785
    assert(fs_codec->utf8 == 1);
14786
#endif
14787
14788
    /* At this point, PyUnicode_EncodeFSDefault() and
14789
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14790
       the C implementation of the filesystem encoding. */
14791
14792
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14793
       global configuration variables. */
14794
36
    if (_Py_IsMainInterpreter(interp)) {
14795
14796
36
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14797
36
                                      fs_codec->errors) < 0) {
14798
0
            PyErr_NoMemory();
14799
0
            return -1;
14800
0
        }
14801
36
    }
14802
36
    return 0;
14803
36
}
14804
14805
14806
static PyStatus
14807
init_fs_encoding(PyThreadState *tstate)
14808
36
{
14809
36
    PyInterpreterState *interp = tstate->interp;
14810
14811
    /* Update the filesystem encoding to the normalized Python codec name.
14812
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14813
       (Python codec name). */
14814
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14815
36
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14816
0
        _Py_DumpPathConfig(tstate);
14817
0
        return _PyStatus_ERR("failed to get the Python codec "
14818
0
                             "of the filesystem encoding");
14819
0
    }
14820
14821
36
    if (init_fs_codec(interp) < 0) {
14822
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14823
0
    }
14824
36
    return _PyStatus_OK();
14825
36
}
14826
14827
14828
PyStatus
14829
_PyUnicode_InitEncodings(PyThreadState *tstate)
14830
36
{
14831
36
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14832
36
    if (_PyStatus_EXCEPTION(status)) {
14833
0
        return status;
14834
0
    }
14835
36
    status = init_fs_encoding(tstate);
14836
36
    if (_PyStatus_EXCEPTION(status)) {
14837
0
        return status;
14838
0
    }
14839
14840
36
    return init_stdio_encoding(tstate->interp);
14841
36
}
14842
14843
14844
static void
14845
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14846
0
{
14847
0
    PyMem_RawFree(fs_codec->encoding);
14848
0
    fs_codec->encoding = NULL;
14849
0
    fs_codec->utf8 = 0;
14850
0
    PyMem_RawFree(fs_codec->errors);
14851
0
    fs_codec->errors = NULL;
14852
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14853
0
}
14854
14855
14856
#ifdef MS_WINDOWS
14857
int
14858
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
14859
{
14860
    PyInterpreterState *interp = _PyInterpreterState_GET();
14861
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
14862
14863
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
14864
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
14865
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
14866
    if (encoding == NULL || errors == NULL) {
14867
        PyMem_RawFree(encoding);
14868
        PyMem_RawFree(errors);
14869
        PyErr_NoMemory();
14870
        return -1;
14871
    }
14872
14873
    PyMem_RawFree(config->filesystem_encoding);
14874
    config->filesystem_encoding = encoding;
14875
    PyMem_RawFree(config->filesystem_errors);
14876
    config->filesystem_errors = errors;
14877
14878
    return init_fs_codec(interp);
14879
}
14880
#endif
14881
14882
14883
#ifdef Py_DEBUG
14884
static inline int
14885
unicode_is_finalizing(void)
14886
{
14887
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
14888
}
14889
#endif
14890
14891
14892
void
14893
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14894
0
{
14895
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14896
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14897
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14898
0
}
14899
14900
14901
void
14902
_PyUnicode_Fini(PyInterpreterState *interp)
14903
0
{
14904
0
    struct _Py_unicode_state *state = &interp->unicode;
14905
14906
0
    if (!has_shared_intern_dict(interp)) {
14907
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14908
0
        assert(get_interned_dict(interp) == NULL);
14909
0
    }
14910
14911
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14912
14913
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14914
    // subsequent initialization of interpreter.
14915
0
    interp->unicode.ucnhash_capi = NULL;
14916
14917
0
    unicode_clear_identifiers(state);
14918
0
}
14919
14920
/* A _string module, to export formatter_parser and formatter_field_name_split
14921
   to the string.Formatter class implemented in Python. */
14922
14923
static PyMethodDef _string_methods[] = {
14924
    {"formatter_field_name_split", formatter_field_name_split,
14925
     METH_O, PyDoc_STR("split the argument as a field name")},
14926
    {"formatter_parser", formatter_parser,
14927
     METH_O, PyDoc_STR("parse the argument as a format string")},
14928
    {NULL, NULL}
14929
};
14930
14931
static PyModuleDef_Slot module_slots[] = {
14932
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14933
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14934
    {0, NULL}
14935
};
14936
14937
static struct PyModuleDef _string_module = {
14938
    PyModuleDef_HEAD_INIT,
14939
    .m_name = "_string",
14940
    .m_doc = PyDoc_STR("string helper module"),
14941
    .m_size = 0,
14942
    .m_methods = _string_methods,
14943
    .m_slots = module_slots,
14944
};
14945
14946
PyMODINIT_FUNC
14947
PyInit__string(void)
14948
10
{
14949
10
    return PyModuleDef_Init(&_string_module);
14950
10
}
14951
14952
14953
#undef PyUnicode_KIND
14954
int PyUnicode_KIND(PyObject *op)
14955
0
{
14956
0
    if (!PyUnicode_Check(op)) {
14957
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14958
0
        return -1;
14959
0
    }
14960
0
    return _PyASCIIObject_CAST(op)->state.kind;
14961
0
}
14962
14963
#undef PyUnicode_DATA
14964
void* PyUnicode_DATA(PyObject *op)
14965
0
{
14966
0
    if (!PyUnicode_Check(op)) {
14967
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14968
0
        return NULL;
14969
0
    }
14970
0
    return _PyUnicode_DATA(op);
14971
0
}