Coverage Report

Created: 2026-03-08 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def converter_init(self):
90
        if self.default is not unspecified:
91
            self.c_default = ascii(self.default)
92
            if len(self.c_default) > 4 or self.c_default[0] != "'":
93
                self.c_default = hex(ord(self.default))
94
95
[python start generated code]*/
96
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
97
98
/* --- Globals ------------------------------------------------------------
99
100
NOTE: In the interpreter's initialization phase, some globals are currently
101
      initialized dynamically as needed. In the process Unicode objects may
102
      be created before the Unicode type is ready.
103
104
*/
105
106
22.0M
#define MAX_UNICODE _Py_MAX_UNICODE
107
290M
#define ensure_unicode _PyUnicode_EnsureUnicode
108
109
#ifdef Py_DEBUG
110
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
111
#else
112
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
113
#endif
114
115
/* Read the `utf8` pointer field of the compact unicode object `op` through
   FT_ATOMIC_LOAD_PTR_ACQUIRE.  No compact-ASCII special case is applied
   here; PyUnicode_UTF8() handles that case before falling back to this
   helper. */
static inline char* _PyUnicode_UTF8(PyObject *op)
116
244M
{
117
244M
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
118
244M
}
119
120
/* Return the UTF-8 pointer for `op`.
   For a compact ASCII string this is the memory located right after the
   PyASCIIObject header; for every other string it is the value loaded from
   the `utf8` field by _PyUnicode_UTF8(). */
static inline char* PyUnicode_UTF8(PyObject *op)
121
82.1M
{
122
82.1M
    assert(_PyUnicode_CHECK(op));
123
82.1M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
124
64.6M
        return ((char*)(_PyASCIIObject_CAST(op) + 1));
125
64.6M
    }
126
17.4M
    else {
127
17.4M
         return _PyUnicode_UTF8(op);
128
17.4M
    }
129
82.1M
}
130
131
/* Store `utf8` into the `utf8` field of the compact unicode object `op`
   through FT_ATOMIC_STORE_PTR_RELEASE (the store-side counterpart of the
   acquire load done in _PyUnicode_UTF8()). */
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
132
29.7M
{
133
29.7M
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
134
29.7M
}
135
136
/* Return the length that goes with the UTF-8 pointer of `op`:
   the `length` field for a compact ASCII string, the `utf8_length` field
   of the compact unicode header otherwise. */
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
137
35.2M
{
138
35.2M
    assert(_PyUnicode_CHECK(op));
139
35.2M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
140
32.0M
         return _PyASCIIObject_CAST(op)->length;
141
32.0M
    }
142
3.17M
    else {
143
3.17M
         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
144
3.17M
    }
145
35.2M
}
146
147
/* Set the `utf8_length` field of the compact unicode object `op` to
   `length`.  Unlike PyUnicode_SET_UTF8(), this is a plain assignment that
   does not go through an FT_ATOMIC_* macro. */
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
148
29.7M
{
149
29.7M
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
150
29.7M
}
151
152
#define _PyUnicode_LENGTH(op)                           \
153
650M
    (_PyASCIIObject_CAST(op)->length)
154
#define _PyUnicode_STATE(op)                            \
155
4.10G
    (_PyASCIIObject_CAST(op)->state)
156
#define _PyUnicode_HASH(op)                             \
157
600M
    (_PyASCIIObject_CAST(op)->hash)
158
159
1.25G
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` in the `hash` field of `op` using
   FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
162
69.4M
{
163
69.4M
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
164
69.4M
}
165
166
#define _PyUnicode_DATA_ANY(op)                         \
167
64.2M
    (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero if the `utf8` pointer of `op` is the same address as
   PyUnicode_DATA(op).  Must not be called on a compact ASCII string
   (asserted below). */
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
170
0
{
171
0
    assert(_PyUnicode_CHECK(op));
172
0
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
173
0
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
174
0
}
175
176
/* Return true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data): `op` is not compact ASCII, its utf8 pointer
   is non-NULL, and that pointer differs from PyUnicode_DATA(op). */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
649M
{
180
649M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
213M
            && _PyUnicode_UTF8(op) != NULL
182
13.6M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
649M
}
184
185
186
280M
#define LATIN1 _Py_LATIN1_CHR
187
188
/* Forward declaration */
189
static PyObject *
190
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
191
                    const char *errors);
192
static PyObject *
193
unicode_decode_utf8(const char *s, Py_ssize_t size,
194
                    _Py_error_handler error_handler, const char *errors,
195
                    Py_ssize_t *consumed);
196
#ifdef Py_DEBUG
197
static inline int unicode_is_finalizing(void);
198
static int unicode_is_singleton(PyObject *unicode);
199
#endif
200
201
202
// Return a reference to the immortal empty string singleton.
// The result is the address of _Py_STR(empty); this function performs no
// reference-count operation itself.
203
PyObject*
204
_PyUnicode_GetEmpty(void)
205
140M
{
206
140M
    _Py_DECLARE_STR(empty, "");
207
140M
    return &_Py_STR(empty);
208
140M
}
209
210
/* This dictionary holds per-interpreter interned strings.
211
 * See InternalDocs/string_interning.md for details.
 *
 * get_interned_dict() returns the `interned_strings` cached object of
 * `interp`.  Callers in this file compare the result against NULL, so the
 * slot may be empty (before initialization or after it has been cleared).
212
 */
213
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
214
5.25M
{
215
5.25M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
216
5.25M
}
217
218
/* This hashtable holds statically allocated interned strings.
219
 * See InternalDocs/string_interning.md for details.
220
 */
221
5.44M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
222
223
/* Get number of all interned strings for the current interpreter.
   The total is the number of entries in the global INTERNED_STRINGS
   hashtable plus the size of the current interpreter's interned dict.
   NOTE(review): neither INTERNED_STRINGS nor the dict is checked for NULL
   here -- presumably this is only called once both exist; confirm. */
224
Py_ssize_t
225
_PyUnicode_InternedSize(void)
226
0
{
227
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
228
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
229
0
}
230
231
/* Get number of immortal interned strings for the current interpreter.
   Counts the keys of the per-interpreter interned dict whose interned state
   is SSTATE_INTERNED_IMMORTAL and adds the length of the global
   INTERNED_STRINGS hashtable. */
232
Py_ssize_t
233
_PyUnicode_InternedSize_Immortal(void)
234
0
{
235
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
236
0
    PyObject *key, *value;
237
0
    Py_ssize_t pos = 0;
238
0
    Py_ssize_t count = 0;
239
240
    // It's tempting to keep a count and avoid a loop here. But, this function
241
    // is intended for refleak tests. It spends extra work to report the true
242
    // value, to help detect bugs in optimizations.
243
244
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
        // Statically allocated interned strings are not expected in the
        // per-interpreter dict; they are accounted for via INTERNED_STRINGS.
245
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
246
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
247
0
           count++;
248
0
       }
249
0
    }
    // Every entry of the global table is added to the immortal count.
250
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
251
0
}
252
253
static Py_hash_t unicode_hash(PyObject *);
254
255
/* Hash callback given to _Py_hashtable_new_full() for INTERNED_STRINGS:
   `key` is treated as a PyObject* and hashed with unicode_hash(). */
static Py_uhash_t
256
hashtable_unicode_hash(const void *key)
257
5.44M
{
258
5.44M
    return unicode_hash((PyObject *)key);
259
5.44M
}
260
261
/* Compare callback given to _Py_hashtable_new_full() for INTERNED_STRINGS.
   Returns the result of unicode_eq() when both keys are non-NULL; otherwise
   falls back to pointer equality, so two NULL keys compare equal and a NULL
   key never equals a non-NULL one. */
static int
262
hashtable_unicode_compare(const void *key1, const void *key2)
263
562k
{
264
562k
    PyObject *obj1 = (PyObject *)key1;
265
562k
    PyObject *obj2 = (PyObject *)key2;
266
562k
    if (obj1 != NULL && obj2 != NULL) {
267
562k
        return unicode_eq(obj1, obj2);
268
562k
    }
269
0
    else {
270
0
        return obj1 == obj2;
271
0
    }
272
562k
}
273
274
/* Return true if this interpreter should share the main interpreter's
275
   intern_dict.  That's important for interpreters which load basic
276
   single-phase init extension modules (m_size == -1).  There could be interned
277
   immortal strings that are shared between interpreters, due to the
278
   PyDict_Update(mdict, m_copy) call in import_find_extension().
279
280
   It's not safe to deallocate those strings until all interpreters that
281
   potentially use them are freed.  By storing them in the main interpreter, we
282
   ensure they get freed after all other interpreters are freed.
283
*/
284
static bool
285
has_shared_intern_dict(PyInterpreterState *interp)
286
34
{
287
34
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    // `&` binds tighter than `&&`: the result is true only for a non-main
    // interpreter whose feature_flags include Py_RTFLAGS_USE_MAIN_OBMALLOC.
288
34
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
289
34
}
290
291
/* Install the interned-strings dict of `interp`; the slot must be empty on
   entry (asserted).
   An interpreter for which has_shared_intern_dict() is true takes a new
   reference to the main interpreter's dict; any other interpreter gets a
   fresh dict from PyDict_New().
   Returns 0 on success, -1 if PyDict_New() fails. */
static int
292
init_interned_dict(PyInterpreterState *interp)
293
34
{
294
34
    assert(get_interned_dict(interp) == NULL);
295
34
    PyObject *interned;
296
34
    if (has_shared_intern_dict(interp)) {
        // NOTE(review): assumes the main interpreter's dict already exists
        // (Py_INCREF is applied without a NULL check) -- confirm.
297
0
        interned = get_interned_dict(_PyInterpreterState_Main());
298
0
        Py_INCREF(interned);
299
0
    }
300
34
    else {
301
34
        interned = PyDict_New();
302
34
        if (interned == NULL) {
303
0
            return -1;
304
0
        }
305
34
    }
306
34
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
307
34
    return 0;
308
34
}
309
310
/* Release `interp`'s reference to its interned-strings dict, if it has one,
   and reset the slot to NULL.  The dict's contents are emptied first only
   when the dict is not shared with the main interpreter. */
static void
311
clear_interned_dict(PyInterpreterState *interp)
312
0
{
313
0
    PyObject *interned = get_interned_dict(interp);
314
0
    if (interned != NULL) {
315
0
        if (!has_shared_intern_dict(interp)) {
316
            // only clear if the dict belongs to this interpreter
317
0
            PyDict_Clear(interned);
318
0
        }
319
0
        Py_DECREF(interned);
320
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
321
0
    }
322
0
}
323
324
/* Create the global INTERNED_STRINGS hashtable (it must not exist yet) and
   fill it: first through _PyUnicode_InitStaticStrings(), then with the 256
   LATIN1() one-character strings.
   Returns _PyStatus_OK() on success, or an error status if the hashtable
   cannot be created. */
static PyStatus
325
init_global_interned_strings(PyInterpreterState *interp)
326
34
{
327
34
    assert(INTERNED_STRINGS == NULL);
328
34
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
329
330
34
    INTERNED_STRINGS = _Py_hashtable_new_full(
331
34
        hashtable_unicode_hash,
332
34
        hashtable_unicode_compare,
333
        // Objects stored here are immortal and statically allocated,
334
        // so we don't need key_destroy_func & value_destroy_func:
335
34
        NULL,
336
34
        NULL,
337
34
        &hashtable_alloc
338
34
    );
339
34
    if (INTERNED_STRINGS == NULL) {
        // The failure is reported through the returned PyStatus, so any
        // pending Python exception is dropped.
340
0
        PyErr_Clear();
341
0
        return _PyStatus_ERR("failed to create global interned dict");
342
0
    }
343
344
    /* Intern statically allocated string identifiers, deepfreeze strings,
345
     * and one-byte latin-1 strings.
346
     * This must be done before any module initialization so that statically
347
     * allocated string identifiers are used instead of heap allocated strings.
348
     * Deepfreeze uses the interned identifiers if present to save space
349
     * else generates them and they are interned to speed up dict lookups.
350
     */
351
34
    _PyUnicode_InitStaticStrings(interp);
352
353
8.73k
    for (int i = 0; i < 256; i++) {
354
8.70k
        PyObject *s = LATIN1(i);
355
8.70k
        _PyUnicode_InternStatic(interp, &s);
        // Interning must leave `s` pointing at the same LATIN1(i) object.
356
8.70k
        assert(s == LATIN1(i));
357
8.70k
    }
358
#ifdef Py_DEBUG
359
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
360
361
    for (int i = 0; i < 256; i++) {
362
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
363
    }
364
#endif
365
34
    return _PyStatus_OK();
366
34
}
367
368
/* Destroy the global INTERNED_STRINGS hashtable, if it exists, and reset the
   pointer to NULL so that a later call is a no-op. */
static void clear_global_interned_strings(void)
369
0
{
370
0
    if (INTERNED_STRINGS != NULL) {
371
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
372
0
        INTERNED_STRINGS = NULL;
373
0
    }
374
0
}
375
376
#define _Py_RETURN_UNICODE_EMPTY()   \
377
51.6M
    do {                             \
378
51.6M
        return _PyUnicode_GetEmpty();\
379
51.6M
    } while (0)
380
381
382
/* Fast detection of the most frequent whitespace characters */
383
const unsigned char _Py_ascii_whitespace[] = {
384
    0, 0, 0, 0, 0, 0, 0, 0,
385
/*     case 0x0009: * CHARACTER TABULATION */
386
/*     case 0x000A: * LINE FEED */
387
/*     case 0x000B: * LINE TABULATION */
388
/*     case 0x000C: * FORM FEED */
389
/*     case 0x000D: * CARRIAGE RETURN */
390
    0, 1, 1, 1, 1, 1, 0, 0,
391
    0, 0, 0, 0, 0, 0, 0, 0,
392
/*     case 0x001C: * FILE SEPARATOR */
393
/*     case 0x001D: * GROUP SEPARATOR */
394
/*     case 0x001E: * RECORD SEPARATOR */
395
/*     case 0x001F: * UNIT SEPARATOR */
396
    0, 0, 0, 0, 1, 1, 1, 1,
397
/*     case 0x0020: * SPACE */
398
    1, 0, 0, 0, 0, 0, 0, 0,
399
    0, 0, 0, 0, 0, 0, 0, 0,
400
    0, 0, 0, 0, 0, 0, 0, 0,
401
    0, 0, 0, 0, 0, 0, 0, 0,
402
403
    0, 0, 0, 0, 0, 0, 0, 0,
404
    0, 0, 0, 0, 0, 0, 0, 0,
405
    0, 0, 0, 0, 0, 0, 0, 0,
406
    0, 0, 0, 0, 0, 0, 0, 0,
407
    0, 0, 0, 0, 0, 0, 0, 0,
408
    0, 0, 0, 0, 0, 0, 0, 0,
409
    0, 0, 0, 0, 0, 0, 0, 0,
410
    0, 0, 0, 0, 0, 0, 0, 0
411
};
412
413
/* forward */
414
static PyObject* get_latin1_char(unsigned char ch);
415
416
417
static PyObject *
418
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
421
static PyObject *
422
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
423
424
static PyObject *
425
unicode_encode_call_errorhandler(const char *errors,
426
       PyObject **errorHandler,const char *encoding, const char *reason,
427
       PyObject *unicode, PyObject **exceptionObject,
428
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
429
430
static void
431
raise_encode_exception(PyObject **exceptionObject,
432
                       const char *encoding,
433
                       PyObject *unicode,
434
                       Py_ssize_t startpos, Py_ssize_t endpos,
435
                       const char *reason);
436
437
/* Same for linebreaks */
438
static const unsigned char ascii_linebreak[] = {
439
    0, 0, 0, 0, 0, 0, 0, 0,
440
/*         0x000A, * LINE FEED */
441
/*         0x000B, * LINE TABULATION */
442
/*         0x000C, * FORM FEED */
443
/*         0x000D, * CARRIAGE RETURN */
444
    0, 0, 1, 1, 1, 1, 0, 0,
445
    0, 0, 0, 0, 0, 0, 0, 0,
446
/*         0x001C, * FILE SEPARATOR */
447
/*         0x001D, * GROUP SEPARATOR */
448
/*         0x001E, * RECORD SEPARATOR */
449
    0, 0, 0, 0, 1, 1, 1, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0,
452
    0, 0, 0, 0, 0, 0, 0, 0,
453
    0, 0, 0, 0, 0, 0, 0, 0,
454
455
    0, 0, 0, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 0, 0, 0, 0, 0, 0, 0,
459
    0, 0, 0, 0, 0, 0, 0, 0,
460
    0, 0, 0, 0, 0, 0, 0, 0,
461
    0, 0, 0, 0, 0, 0, 0, 0,
462
    0, 0, 0, 0, 0, 0, 0, 0
463
};
464
465
static int convert_uc(PyObject *obj, void *addr);
466
467
struct encoding_map;
468
#include "clinic/unicodeobject.c.h"
469
470
/* Map an error handler name to its _Py_error_handler value.
   NULL is treated like "strict".  Names are matched exactly with strcmp()
   (so matching is case-sensitive); any name that is not one of the seven
   listed below yields _Py_ERROR_OTHER. */
_Py_error_handler
471
_Py_GetErrorHandler(const char *errors)
472
976k
{
473
976k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
474
242k
        return _Py_ERROR_STRICT;
475
242k
    }
476
733k
    if (strcmp(errors, "surrogateescape") == 0) {
477
499k
        return _Py_ERROR_SURROGATEESCAPE;
478
499k
    }
479
234k
    if (strcmp(errors, "replace") == 0) {
480
234k
        return _Py_ERROR_REPLACE;
481
234k
    }
482
0
    if (strcmp(errors, "ignore") == 0) {
483
0
        return _Py_ERROR_IGNORE;
484
0
    }
485
0
    if (strcmp(errors, "backslashreplace") == 0) {
486
0
        return _Py_ERROR_BACKSLASHREPLACE;
487
0
    }
488
0
    if (strcmp(errors, "surrogatepass") == 0) {
489
0
        return _Py_ERROR_SURROGATEPASS;
490
0
    }
491
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
492
0
        return _Py_ERROR_XMLCHARREFREPLACE;
493
0
    }
494
0
    return _Py_ERROR_OTHER;
495
0
}
496
497
498
/* Wide-character counterpart of _Py_GetErrorHandler(): map a wchar_t error
   handler name to its _Py_error_handler value using wcscmp().
   NULL is treated like L"strict"; unrecognized names yield
   _Py_ERROR_OTHER. */
static _Py_error_handler
499
get_error_handler_wide(const wchar_t *errors)
500
11.7k
{
501
11.7k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
502
0
        return _Py_ERROR_STRICT;
503
0
    }
504
11.7k
    if (wcscmp(errors, L"surrogateescape") == 0) {
505
11.7k
        return _Py_ERROR_SURROGATEESCAPE;
506
11.7k
    }
507
0
    if (wcscmp(errors, L"replace") == 0) {
508
0
        return _Py_ERROR_REPLACE;
509
0
    }
510
0
    if (wcscmp(errors, L"ignore") == 0) {
511
0
        return _Py_ERROR_IGNORE;
512
0
    }
513
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
514
0
        return _Py_ERROR_BACKSLASHREPLACE;
515
0
    }
516
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
517
0
        return _Py_ERROR_SURROGATEPASS;
518
0
    }
519
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
520
0
        return _Py_ERROR_XMLCHARREFREPLACE;
521
0
    }
522
0
    return _Py_ERROR_OTHER;
523
0
}
524
525
526
/* Check that `encoding` and `errors` name a known codec and a known error
   handler by looking them up.

   Returns 0 when the check passes or is skipped, and -1 when
   _PyCodec_Lookup() or PyCodec_LookupError() returns NULL
   (NOTE(review): presumably with an exception set by the lookup -- confirm).

   The check is skipped when both arguments are NULL, in non-debug builds
   unless dev_mode is set, while fs_codec.encoding is not yet set, and while
   the interpreter is finalizing. */
static inline int
527
unicode_check_encoding_errors(const char *encoding, const char *errors)
528
27.4M
{
529
27.4M
    if (encoding == NULL && errors == NULL) {
530
15.4M
        return 0;
531
15.4M
    }
532
533
11.9M
    PyInterpreterState *interp = _PyInterpreterState_GET();
534
11.9M
#ifndef Py_DEBUG
535
    /* In release mode, only check in development mode (-X dev) */
536
11.9M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
537
11.9M
        return 0;
538
11.9M
    }
539
#else
540
    /* Always check in debug mode */
541
#endif
542
543
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
544
       codec registry is ready: before _PyUnicode_InitEncodings() is called. */
545
0
    if (!interp->unicode.fs_codec.encoding) {
546
0
        return 0;
547
0
    }
548
549
    /* Disable checks during Python finalization. For example, this allows
550
     * calling PyObject_Dump() during finalization for debugging purposes.
551
     */
552
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
553
0
        return 0;
554
0
    }
555
556
0
    if (encoding != NULL
557
        // Fast path for the most common built-in encodings. Even if the codec
558
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
559
        // create a temporary Unicode string (the key in the cache).
560
0
        && strcmp(encoding, "utf-8") != 0
561
0
        && strcmp(encoding, "utf8") != 0
562
0
        && strcmp(encoding, "ascii") != 0)
563
0
    {
        // Only the success of the lookup matters; the codec object itself is
        // released right away.
564
0
        PyObject *handler = _PyCodec_Lookup(encoding);
565
0
        if (handler == NULL) {
566
0
            return -1;
567
0
        }
568
0
        Py_DECREF(handler);
569
0
    }
570
571
0
    if (errors != NULL
572
        // Fast path for the most common built-in error handlers.
573
0
        && strcmp(errors, "strict") != 0
574
0
        && strcmp(errors, "ignore") != 0
575
0
        && strcmp(errors, "replace") != 0
576
0
        && strcmp(errors, "surrogateescape") != 0
577
0
        && strcmp(errors, "surrogatepass") != 0)
578
0
    {
579
0
        PyObject *handler = PyCodec_LookupError(errors);
580
0
        if (handler == NULL) {
581
0
            return -1;
582
0
        }
583
0
        Py_DECREF(handler);
584
0
    }
585
0
    return 0;
586
0
}
587
588
589
/* Verify the internal invariants of the unicode object `op`.

   Every failed CHECK() reports the stringified expression through
   _PyObject_ASSERT_FAILED_MSG() (NOTE(review): presumably that call does not
   return -- confirm).  When `check_content` is non-zero, all characters are
   additionally scanned to verify that the storage kind and the ascii flag
   match the largest character, and that the data ends with a null
   character.  In Py_DEBUG builds the interning state is validated as well.

   Returns 1 when the end of the function is reached. */
int
590
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
591
0
{
592
0
#define CHECK(expr) \
593
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
594
595
0
    assert(op != NULL);
596
0
    CHECK(PyUnicode_Check(op));
597
598
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
599
0
    int kind = ascii->state.kind;
600
601
    /* Compact ASCII: only the kind has to be validated. */
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
602
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
603
0
    }
604
0
    else {
605
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
606
0
        void *data;
607
608
0
        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: the character data directly follows the
               PyCompactUnicodeObject header and must not double as the
               utf8 buffer. */
609
0
            data = compact + 1;
610
0
            CHECK(kind == PyUnicode_1BYTE_KIND
611
0
                                 || kind == PyUnicode_2BYTE_KIND
612
0
                                 || kind == PyUnicode_4BYTE_KIND);
613
0
            CHECK(ascii->state.ascii == 0);
614
0
            CHECK(_PyUnicode_UTF8(op) != data);
615
0
        }
616
0
        else {
            /* Non-compact: the character data is reached through the
               data.any pointer.  For ASCII content the utf8 pointer must
               alias that data; otherwise it must not. */
617
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
618
619
0
            data = unicode->data.any;
620
0
            CHECK(kind == PyUnicode_1BYTE_KIND
621
0
                     || kind == PyUnicode_2BYTE_KIND
622
0
                     || kind == PyUnicode_4BYTE_KIND);
623
0
            CHECK(ascii->state.compact == 0);
624
0
            CHECK(data != NULL);
625
0
            if (ascii->state.ascii) {
626
0
                CHECK(_PyUnicode_UTF8(op) == data);
627
0
                CHECK(compact->utf8_length == ascii->length);
628
0
            }
629
0
            else {
630
0
                CHECK(_PyUnicode_UTF8(op) != data);
631
0
            }
632
0
        }
633
0
#ifndef Py_GIL_DISABLED
        /* Without a utf8 buffer the recorded utf8 length must be zero
           (only checked when Py_GIL_DISABLED is not defined). */
634
0
        if (_PyUnicode_UTF8(op) == NULL)
635
0
            CHECK(compact->utf8_length == 0);
636
0
#endif
637
0
    }
638
639
    /* check that the best kind is used: O(n) operation */
640
0
    if (check_content) {
641
0
        Py_ssize_t i;
642
0
        Py_UCS4 maxchar = 0;
643
0
        const void *data;
644
0
        Py_UCS4 ch;
645
646
0
        data = PyUnicode_DATA(ascii);
647
0
        for (i=0; i < ascii->length; i++)
648
0
        {
649
0
            ch = PyUnicode_READ(kind, data, i);
650
0
            if (ch > maxchar)
651
0
                maxchar = ch;
652
0
        }
        /* The largest character must fall in the range that requires
           exactly this kind (and, for 1-byte data, this ascii flag). */
653
0
        if (kind == PyUnicode_1BYTE_KIND) {
654
0
            if (ascii->state.ascii == 0) {
655
0
                CHECK(maxchar >= 128);
656
0
                CHECK(maxchar <= 255);
657
0
            }
658
0
            else
659
0
                CHECK(maxchar < 128);
660
0
        }
661
0
        else if (kind == PyUnicode_2BYTE_KIND) {
662
0
            CHECK(maxchar >= 0x100);
663
0
            CHECK(maxchar <= 0xFFFF);
664
0
        }
665
0
        else {
666
0
            CHECK(maxchar >= 0x10000);
667
0
            CHECK(maxchar <= MAX_UNICODE);
668
0
        }
        /* The element one past the last character must be a null character. */
669
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
670
0
    }
671
672
    /* Check interning state */
673
#ifdef Py_DEBUG
674
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
675
    // extensions can make immortal strings mortal (but with a high enough
676
    // refcount).
677
    // The other way is extremely unlikely (worth a potential failed assertion
678
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
679
    switch (PyUnicode_CHECK_INTERNED(op)) {
680
        case SSTATE_NOT_INTERNED:
681
            if (ascii->state.statically_allocated) {
682
                // This state is for two exceptions:
683
                // - strings are currently checked before they're interned
684
                // - the 256 one-latin1-character strings
685
                //   are static but use SSTATE_NOT_INTERNED
686
            }
687
            else {
688
                CHECK(!_Py_IsImmortal(op));
689
            }
690
            break;
691
        case SSTATE_INTERNED_MORTAL:
692
            CHECK(!ascii->state.statically_allocated);
693
            CHECK(!_Py_IsImmortal(op));
694
            break;
695
        case SSTATE_INTERNED_IMMORTAL:
696
            CHECK(!ascii->state.statically_allocated);
697
            break;
698
        case SSTATE_INTERNED_IMMORTAL_STATIC:
699
            CHECK(ascii->state.statically_allocated);
700
            break;
701
        default:
702
            Py_UNREACHABLE();
703
    }
704
#endif
705
706
0
    return 1;
707
708
0
#undef CHECK
709
0
}
710
711
/* Canonicalize a string that is about to be returned.

   - length 0: return the empty string singleton;
   - length 1 with 1-byte kind: return the LATIN1() object for that
     character;
   - otherwise: return `unicode` itself.

   When a singleton is substituted, the reference to `unicode` is released
   with Py_DECREF() (unless `unicode` already is that singleton).  The
   singleton is returned without an explicit Py_INCREF()
   (NOTE(review): presumably safe because these objects are immortal --
   confirm for the LATIN1() objects). */
PyObject*
712
_PyUnicode_Result(PyObject *unicode)
713
57.1M
{
714
57.1M
    assert(_PyUnicode_CHECK(unicode));
715
716
57.1M
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
717
57.1M
    if (length == 0) {
718
305
        PyObject *empty = _PyUnicode_GetEmpty();
719
305
        if (unicode != empty) {
720
0
            Py_DECREF(unicode);
721
0
        }
722
305
        return empty;
723
305
    }
724
725
57.1M
    if (length == 1) {
726
844k
        int kind = PyUnicode_KIND(unicode);
        // One-character strings of a wider kind fall through and are
        // returned unchanged.
727
844k
        if (kind == PyUnicode_1BYTE_KIND) {
728
126k
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
729
126k
            Py_UCS1 ch = data[0];
730
126k
            PyObject *latin1_char = LATIN1(ch);
731
126k
            if (unicode != latin1_char) {
732
120k
                Py_DECREF(unicode);
733
120k
            }
734
126k
            return latin1_char;
735
126k
        }
736
844k
    }
737
738
57.1M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
739
57.0M
    return unicode;
740
57.1M
}
741
1.52M
#define unicode_result _PyUnicode_Result
742
743
/* Return `unicode` as the result of an operation that did not change it.
   An exact str instance is returned as a new reference to the same object;
   for anything else the result of _PyUnicode_Copy() is returned. */
static PyObject*
744
unicode_result_unchanged(PyObject *unicode)
745
159M
{
746
159M
    if (PyUnicode_CheckExact(unicode)) {
747
157M
        return Py_NewRef(unicode);
748
157M
    }
749
2.92M
    else
750
        /* Subtype -- return genuine unicode string with the same value. */
751
2.92M
        return _PyUnicode_Copy(unicode);
752
159M
}
753
754
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
755
   ASCII, Latin1, UTF-8, etc.

   Writes one escape sequence at `str` for every character of `unicode` in
   the range [collstart, collend), after growing `writer` by the total size
   of those escapes.

   Returns the write position just past the last escape, or NULL on failure:
   OverflowError is set here if the total size would exceed PY_SSIZE_T_MAX,
   and a NULL result of PyBytesWriter_GrowAndUpdatePointer() is passed
   through unchanged. */
756
static char*
757
backslashreplace(PyBytesWriter *writer, char *str,
758
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
759
0
{
760
0
    Py_ssize_t size, i;
761
0
    Py_UCS4 ch;
762
0
    int kind;
763
0
    const void *data;
764
765
0
    kind = PyUnicode_KIND(unicode);
766
0
    data = PyUnicode_DATA(unicode);
767
768
0
    size = 0;
769
    /* determine replacement size */
770
0
    for (i = collstart; i < collend; ++i) {
771
0
        Py_ssize_t incr;
772
773
0
        ch = PyUnicode_READ(kind, data, i);
        /* Two bytes for the backslash and the x/u/U letter, followed by
           2, 4 or 8 hex digits. */
774
0
        if (ch < 0x100)
775
0
            incr = 2+2;
776
0
        else if (ch < 0x10000)
777
0
            incr = 2+4;
778
0
        else {
779
0
            assert(ch <= MAX_UNICODE);
780
0
            incr = 2+8;
781
0
        }
782
0
        if (size > PY_SSIZE_T_MAX - incr) {
783
0
            PyErr_SetString(PyExc_OverflowError,
784
0
                            "encoded result is too long for a Python string");
785
0
            return NULL;
786
0
        }
787
0
        size += incr;
788
0
    }
789
790
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
791
0
    if (str == NULL) {
792
0
        return NULL;
793
0
    }
794
795
    /* generate replacement */
796
0
    for (i = collstart; i < collend; ++i) {
797
0
        ch = PyUnicode_READ(kind, data, i);
798
0
        *str++ = '\\';
        /* Each branch writes the letter and the leading hex digits of its
           form; the two lowest digits are written by the shared code after
           the branches. */
799
0
        if (ch >= 0x00010000) {
800
0
            *str++ = 'U';
801
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
805
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
806
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
807
0
        }
808
0
        else if (ch >= 0x100) {
809
0
            *str++ = 'u';
810
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
811
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
812
0
        }
813
0
        else
814
0
            *str++ = 'x';
815
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
816
0
        *str++ = Py_hexdigits[ch&0xf];
817
0
    }
818
0
    return str;
819
0
}
820
821
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
822
   ASCII, Latin1, UTF-8, etc.

   Writes a decimal character reference ("&#N;") at `str` for every
   character of `unicode` in the range [collstart, collend), after growing
   `writer` by the total size of those references.

   Returns the write position just past the last reference, or NULL on
   failure: OverflowError is set here if the total size would exceed
   PY_SSIZE_T_MAX; a NULL result of PyBytesWriter_GrowAndUpdatePointer() is
   passed through unchanged; NULL is also returned if sprintf() reports an
   error. */
823
static char*
824
xmlcharrefreplace(PyBytesWriter *writer, char *str,
825
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
826
0
{
827
0
    Py_ssize_t size, i;
828
0
    Py_UCS4 ch;
829
0
    int kind;
830
0
    const void *data;
831
832
0
    kind = PyUnicode_KIND(unicode);
833
0
    data = PyUnicode_DATA(unicode);
834
835
0
    size = 0;
836
    /* determine replacement size */
837
0
    for (i = collstart; i < collend; ++i) {
838
0
        Py_ssize_t incr;
839
840
0
        ch = PyUnicode_READ(kind, data, i);
        /* Two bytes for "&#", one per decimal digit of ch (1 to 7), and one
           for the closing ';'. */
841
0
        if (ch < 10)
842
0
            incr = 2+1+1;
843
0
        else if (ch < 100)
844
0
            incr = 2+2+1;
845
0
        else if (ch < 1000)
846
0
            incr = 2+3+1;
847
0
        else if (ch < 10000)
848
0
            incr = 2+4+1;
849
0
        else if (ch < 100000)
850
0
            incr = 2+5+1;
851
0
        else if (ch < 1000000)
852
0
            incr = 2+6+1;
853
0
        else {
854
0
            assert(ch <= MAX_UNICODE);
855
0
            incr = 2+7+1;
856
0
        }
857
0
        if (size > PY_SSIZE_T_MAX - incr) {
858
0
            PyErr_SetString(PyExc_OverflowError,
859
0
                            "encoded result is too long for a Python string");
860
0
            return NULL;
861
0
        }
862
0
        size += incr;
863
0
    }
864
865
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
866
0
    if (str == NULL) {
867
0
        return NULL;
868
0
    }
869
870
    /* generate replacement */
871
0
    for (i = collstart; i < collend; ++i) {
        /* NOTE(review): "%d" is used with a Py_UCS4 argument; presumably
           fine because values are at most MAX_UNICODE and fit in an int --
           confirm.
           NOTE(review): sprintf() also writes a terminating NUL that is not
           part of the size computed above; presumably the writer's buffer
           has room for it -- confirm. */
872
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
873
0
        if (size < 0) {
874
0
            return NULL;
875
0
        }
876
0
        str += size;
877
0
    }
878
0
    return str;
879
0
}
880
881
/* --- Bloom Filters ----------------------------------------------------- */
882
883
/* Helpers implementing a simple "bloom filter" for Unicode characters.
884
   To keep things simple, we use a single bitmask, using the low bits of
885
   each character (5, 6 or 7 bits, depending on BLOOM_WIDTH) as the bit index. */
886
887
/* the linebreak mask is set up by _PyUnicode_Init() below */
888
889
#if LONG_BIT >= 128
890
#define BLOOM_WIDTH 128
891
#elif LONG_BIT >= 64
892
55.1M
#define BLOOM_WIDTH 64
893
#elif LONG_BIT >= 32
894
#define BLOOM_WIDTH 32
895
#else
896
#error "LONG_BIT is smaller than 32"
897
#endif
898
899
20.9M
/* Integer type used to hold one bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters.  It starts with every bit set, so
   until it is replaced every character passes the bloom pre-check. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that expression arguments such as
   `a | b` are tested as a whole instead of being regrouped by operator
   precedence. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: code points below 128 are looked up in the
   ascii_linebreak table; others must pass the bloom pre-check before the
   Py_UNICODE_ISLINEBREAK() test is evaluated. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
908
909
static inline BLOOM_MASK
910
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
911
10.4M
{
912
10.4M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
913
10.4M
    do {                                               \
914
10.4M
        TYPE *data = (TYPE *)PTR;                      \
915
10.4M
        TYPE *end = data + LEN;                        \
916
10.4M
        Py_UCS4 ch;                                    \
917
22.8M
        for (; data != end; data++) {                  \
918
12.4M
            ch = *data;                                \
919
12.4M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
920
12.4M
        }                                              \
921
10.4M
        break;                                         \
922
10.4M
    } while (0)
923
924
    /* calculate simple bloom-style bitmask for a given unicode string */
925
926
10.4M
    BLOOM_MASK mask;
927
928
10.4M
    mask = 0;
929
10.4M
    switch (kind) {
930
10.4M
    case PyUnicode_1BYTE_KIND:
931
10.4M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
932
10.4M
        break;
933
34
    case PyUnicode_2BYTE_KIND:
934
34
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
935
34
        break;
936
0
    case PyUnicode_4BYTE_KIND:
937
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
938
0
        break;
939
0
    default:
940
0
        Py_UNREACHABLE();
941
10.4M
    }
942
10.4M
    return mask;
943
944
10.4M
#undef BLOOM_UPDATE
945
10.4M
}
946
947
/* Compilation of templated routines */
948
949
1.39M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
950
951
#include "stringlib/asciilib.h"
952
#include "stringlib/fastsearch.h"
953
#include "stringlib/partition.h"
954
#include "stringlib/split.h"
955
#include "stringlib/count.h"
956
#include "stringlib/find.h"
957
#include "stringlib/find_max_char.h"
958
#include "stringlib/undef.h"
959
960
#include "stringlib/ucs1lib.h"
961
#include "stringlib/fastsearch.h"
962
#include "stringlib/partition.h"
963
#include "stringlib/split.h"
964
#include "stringlib/count.h"
965
#include "stringlib/find.h"
966
#include "stringlib/replace.h"
967
#include "stringlib/repr.h"
968
#include "stringlib/find_max_char.h"
969
#include "stringlib/undef.h"
970
971
#include "stringlib/ucs2lib.h"
972
#include "stringlib/fastsearch.h"
973
#include "stringlib/partition.h"
974
#include "stringlib/split.h"
975
#include "stringlib/count.h"
976
#include "stringlib/find.h"
977
#include "stringlib/replace.h"
978
#include "stringlib/repr.h"
979
#include "stringlib/find_max_char.h"
980
#include "stringlib/undef.h"
981
982
#include "stringlib/ucs4lib.h"
983
#include "stringlib/fastsearch.h"
984
#include "stringlib/partition.h"
985
#include "stringlib/split.h"
986
#include "stringlib/count.h"
987
#include "stringlib/find.h"
988
#include "stringlib/replace.h"
989
#include "stringlib/repr.h"
990
#include "stringlib/find_max_char.h"
991
#include "stringlib/undef.h"
992
993
#undef STRINGLIB_GET_EMPTY
994
995
/* --- Unicode Object ----------------------------------------------------- */
996
997
static inline Py_ssize_t
998
findchar(const void *s, int kind,
999
         Py_ssize_t size, Py_UCS4 ch,
1000
         int direction)
1001
219M
{
1002
219M
    switch (kind) {
1003
209M
    case PyUnicode_1BYTE_KIND:
1004
209M
        if ((Py_UCS1) ch != ch)
1005
3.67k
            return -1;
1006
209M
        if (direction > 0)
1007
209M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1008
72.9k
        else
1009
72.9k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1010
8.17M
    case PyUnicode_2BYTE_KIND:
1011
8.17M
        if ((Py_UCS2) ch != ch)
1012
0
            return -1;
1013
8.17M
        if (direction > 0)
1014
7.95M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1015
222k
        else
1016
222k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1017
2.08M
    case PyUnicode_4BYTE_KIND:
1018
2.08M
        if (direction > 0)
1019
1.96M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1020
124k
        else
1021
124k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1022
0
    default:
1023
0
        Py_UNREACHABLE();
1024
219M
    }
1025
219M
}
1026
1027
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length) of
   a Unicode string with 0xff bytes so that use of uninitialized characters
   is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* kind equals the byte width of one character, so offsets and sizes
           in characters are scaled by it. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1045
1046
static PyObject*
1047
resize_copy(PyObject *unicode, Py_ssize_t length)
1048
0
{
1049
0
    Py_ssize_t copy_length;
1050
0
    PyObject *copy;
1051
1052
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1053
0
    if (copy == NULL)
1054
0
        return NULL;
1055
1056
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1057
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1058
0
    return copy;
1059
0
}
1060
1061
PyObject*
1062
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1063
66.1M
{
1064
66.1M
    Py_ssize_t char_size;
1065
66.1M
    Py_ssize_t struct_size;
1066
66.1M
    Py_ssize_t new_size;
1067
66.1M
    PyObject *new_unicode;
1068
#ifdef Py_DEBUG
1069
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1070
#endif
1071
1072
66.1M
    if (!_PyUnicode_IsModifiable(unicode)) {
1073
0
        PyObject *copy = resize_copy(unicode, length);
1074
0
        if (copy == NULL) {
1075
0
            return NULL;
1076
0
        }
1077
0
        Py_DECREF(unicode);
1078
0
        return copy;
1079
0
    }
1080
66.1M
    assert(PyUnicode_IS_COMPACT(unicode));
1081
1082
66.1M
    char_size = PyUnicode_KIND(unicode);
1083
66.1M
    if (PyUnicode_IS_ASCII(unicode))
1084
50.8M
        struct_size = sizeof(PyASCIIObject);
1085
15.2M
    else
1086
15.2M
        struct_size = sizeof(PyCompactUnicodeObject);
1087
1088
66.1M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1089
0
        PyErr_NoMemory();
1090
0
        return NULL;
1091
0
    }
1092
66.1M
    new_size = (struct_size + (length + 1) * char_size);
1093
1094
66.1M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1095
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1096
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1097
0
        PyUnicode_SET_UTF8(unicode, NULL);
1098
0
    }
1099
#ifdef Py_TRACE_REFS
1100
    _Py_ForgetReference(unicode);
1101
#endif
1102
66.1M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1103
1104
66.1M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1105
66.1M
    if (new_unicode == NULL) {
1106
0
        _Py_NewReferenceNoTotal(unicode);
1107
0
        PyErr_NoMemory();
1108
0
        return NULL;
1109
0
    }
1110
66.1M
    unicode = new_unicode;
1111
66.1M
    _Py_NewReferenceNoTotal(unicode);
1112
1113
66.1M
    _PyUnicode_LENGTH(unicode) = length;
1114
#ifdef Py_DEBUG
1115
    unicode_fill_invalid(unicode, old_length);
1116
#endif
1117
66.1M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1118
66.1M
                    length, 0);
1119
66.1M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1120
66.1M
    return unicode;
1121
66.1M
}
1122
1123
static int
1124
resize_inplace(PyObject *unicode, Py_ssize_t length)
1125
0
{
1126
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1127
0
    assert(Py_REFCNT(unicode) == 1);
1128
1129
0
    Py_ssize_t new_size;
1130
0
    Py_ssize_t char_size;
1131
0
    int share_utf8;
1132
0
    void *data;
1133
#ifdef Py_DEBUG
1134
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1135
#endif
1136
1137
0
    data = _PyUnicode_DATA_ANY(unicode);
1138
0
    char_size = PyUnicode_KIND(unicode);
1139
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1140
1141
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1142
0
        PyErr_NoMemory();
1143
0
        return -1;
1144
0
    }
1145
0
    new_size = (length + 1) * char_size;
1146
1147
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1148
0
    {
1149
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1150
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1151
0
        PyUnicode_SET_UTF8(unicode, NULL);
1152
0
    }
1153
1154
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1155
0
    if (data == NULL) {
1156
0
        PyErr_NoMemory();
1157
0
        return -1;
1158
0
    }
1159
0
    _PyUnicode_DATA_ANY(unicode) = data;
1160
0
    if (share_utf8) {
1161
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1162
0
        PyUnicode_SET_UTF8(unicode, data);
1163
0
    }
1164
0
    _PyUnicode_LENGTH(unicode) = length;
1165
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1166
#ifdef Py_DEBUG
1167
    unicode_fill_invalid(unicode, old_length);
1168
#endif
1169
1170
    /* check for integer overflow */
1171
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1172
0
        PyErr_NoMemory();
1173
0
        return -1;
1174
0
    }
1175
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1176
0
    return 0;
1177
0
}
1178
1179
static const char*
1180
unicode_kind_name(PyObject *unicode)
1181
0
{
1182
    /* don't check consistency: unicode_kind_name() is called from
1183
       _PyUnicode_Dump() */
1184
0
    if (!PyUnicode_IS_COMPACT(unicode))
1185
0
    {
1186
0
        switch (PyUnicode_KIND(unicode))
1187
0
        {
1188
0
        case PyUnicode_1BYTE_KIND:
1189
0
            if (PyUnicode_IS_ASCII(unicode))
1190
0
                return "legacy ascii";
1191
0
            else
1192
0
                return "legacy latin1";
1193
0
        case PyUnicode_2BYTE_KIND:
1194
0
            return "legacy UCS2";
1195
0
        case PyUnicode_4BYTE_KIND:
1196
0
            return "legacy UCS4";
1197
0
        default:
1198
0
            return "<legacy invalid kind>";
1199
0
        }
1200
0
    }
1201
0
    switch (PyUnicode_KIND(unicode)) {
1202
0
    case PyUnicode_1BYTE_KIND:
1203
0
        if (PyUnicode_IS_ASCII(unicode))
1204
0
            return "ascii";
1205
0
        else
1206
0
            return "latin1";
1207
0
    case PyUnicode_2BYTE_KIND:
1208
0
        return "UCS2";
1209
0
    case PyUnicode_4BYTE_KIND:
1210
0
        return "UCS4";
1211
0
    default:
1212
0
        return "<invalid compact kind>";
1213
0
    }
1214
0
}
1215
1216
#ifdef Py_DEBUG
1217
/* Functions wrapping macros for use in debugger */
1218
const char *_PyUnicode_utf8(void *unicode_raw){
1219
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1220
    return PyUnicode_UTF8(unicode);
1221
}
1222
1223
const void *_PyUnicode_compact_data(void *unicode_raw) {
1224
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1225
    return _PyUnicode_COMPACT_DATA(unicode);
1226
}
1227
const void *_PyUnicode_data(void *unicode_raw) {
1228
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1229
    printf("obj %p\n", (void*)unicode);
1230
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1231
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1232
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1233
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1234
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1235
    return PyUnicode_DATA(unicode);
1236
}
1237
1238
void
1239
_PyUnicode_Dump(PyObject *op)
1240
{
1241
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1242
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1243
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1244
    const void *data;
1245
1246
    if (ascii->state.compact)
1247
    {
1248
        if (ascii->state.ascii)
1249
            data = (ascii + 1);
1250
        else
1251
            data = (compact + 1);
1252
    }
1253
    else
1254
        data = unicode->data.any;
1255
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1256
1257
    if (!ascii->state.ascii) {
1258
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1259
    }
1260
    printf(", data=%p\n", data);
1261
}
1262
#endif
1263
1264
1265
PyObject *
1266
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1267
598M
{
1268
    /* Optimization for empty strings */
1269
598M
    if (size == 0) {
1270
30.5M
        return _PyUnicode_GetEmpty();
1271
30.5M
    }
1272
1273
568M
    PyObject *obj;
1274
568M
    PyCompactUnicodeObject *unicode;
1275
568M
    void *data;
1276
568M
    int kind;
1277
568M
    int is_ascii;
1278
568M
    Py_ssize_t char_size;
1279
568M
    Py_ssize_t struct_size;
1280
1281
568M
    is_ascii = 0;
1282
568M
    struct_size = sizeof(PyCompactUnicodeObject);
1283
568M
    if (maxchar < 128) {
1284
385M
        kind = PyUnicode_1BYTE_KIND;
1285
385M
        char_size = 1;
1286
385M
        is_ascii = 1;
1287
385M
        struct_size = sizeof(PyASCIIObject);
1288
385M
    }
1289
182M
    else if (maxchar < 256) {
1290
15.8M
        kind = PyUnicode_1BYTE_KIND;
1291
15.8M
        char_size = 1;
1292
15.8M
    }
1293
166M
    else if (maxchar < 65536) {
1294
155M
        kind = PyUnicode_2BYTE_KIND;
1295
155M
        char_size = 2;
1296
155M
    }
1297
10.9M
    else {
1298
10.9M
        if (maxchar > MAX_UNICODE) {
1299
0
            PyErr_SetString(PyExc_SystemError,
1300
0
                            "invalid maximum character passed to PyUnicode_New");
1301
0
            return NULL;
1302
0
        }
1303
10.9M
        kind = PyUnicode_4BYTE_KIND;
1304
10.9M
        char_size = 4;
1305
10.9M
    }
1306
1307
    /* Ensure we won't overflow the size. */
1308
568M
    if (size < 0) {
1309
0
        PyErr_SetString(PyExc_SystemError,
1310
0
                        "Negative size passed to PyUnicode_New");
1311
0
        return NULL;
1312
0
    }
1313
568M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1314
0
        return PyErr_NoMemory();
1315
1316
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1317
     * PyObject_New() so we are able to allocate space for the object and
1318
     * it's data buffer.
1319
     */
1320
568M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1321
568M
    if (obj == NULL) {
1322
0
        return PyErr_NoMemory();
1323
0
    }
1324
568M
    _PyObject_Init(obj, &PyUnicode_Type);
1325
1326
568M
    unicode = (PyCompactUnicodeObject *)obj;
1327
568M
    if (is_ascii)
1328
385M
        data = ((PyASCIIObject*)obj) + 1;
1329
182M
    else
1330
182M
        data = unicode + 1;
1331
568M
    _PyUnicode_LENGTH(unicode) = size;
1332
568M
    _PyUnicode_HASH(unicode) = -1;
1333
568M
    _PyUnicode_STATE(unicode).interned = 0;
1334
568M
    _PyUnicode_STATE(unicode).kind = kind;
1335
568M
    _PyUnicode_STATE(unicode).compact = 1;
1336
568M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1337
568M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1338
568M
    if (is_ascii) {
1339
385M
        ((char*)data)[size] = 0;
1340
385M
    }
1341
182M
    else if (kind == PyUnicode_1BYTE_KIND) {
1342
15.8M
        ((char*)data)[size] = 0;
1343
15.8M
        unicode->utf8 = NULL;
1344
15.8M
        unicode->utf8_length = 0;
1345
15.8M
    }
1346
166M
    else {
1347
166M
        unicode->utf8 = NULL;
1348
166M
        unicode->utf8_length = 0;
1349
166M
        if (kind == PyUnicode_2BYTE_KIND)
1350
155M
            ((Py_UCS2*)data)[size] = 0;
1351
10.9M
        else /* kind == PyUnicode_4BYTE_KIND */
1352
10.9M
            ((Py_UCS4*)data)[size] = 0;
1353
166M
    }
1354
#ifdef Py_DEBUG
1355
    unicode_fill_invalid((PyObject*)unicode, 0);
1356
#endif
1357
568M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1358
568M
    return obj;
1359
568M
}
1360
1361
static int
1362
unicode_check_modifiable(PyObject *unicode)
1363
587
{
1364
587
    if (!_PyUnicode_IsModifiable(unicode)) {
1365
0
        PyErr_SetString(PyExc_SystemError,
1366
0
                        "Cannot modify a string currently used");
1367
0
        return -1;
1368
0
    }
1369
587
    return 0;
1370
587
}
1371
1372
static int
1373
_copy_characters(PyObject *to, Py_ssize_t to_start,
1374
                 PyObject *from, Py_ssize_t from_start,
1375
                 Py_ssize_t how_many, int check_maxchar)
1376
347M
{
1377
347M
    int from_kind, to_kind;
1378
347M
    const void *from_data;
1379
347M
    void *to_data;
1380
1381
347M
    assert(0 <= how_many);
1382
347M
    assert(0 <= from_start);
1383
347M
    assert(0 <= to_start);
1384
347M
    assert(PyUnicode_Check(from));
1385
347M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1386
1387
347M
    assert(to == NULL || PyUnicode_Check(to));
1388
1389
347M
    if (how_many == 0) {
1390
1.82M
        return 0;
1391
1.82M
    }
1392
1393
347M
    assert(to != NULL);
1394
345M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1395
1396
345M
    from_kind = PyUnicode_KIND(from);
1397
345M
    from_data = PyUnicode_DATA(from);
1398
345M
    to_kind = PyUnicode_KIND(to);
1399
345M
    to_data = PyUnicode_DATA(to);
1400
1401
#ifdef Py_DEBUG
1402
    if (!check_maxchar
1403
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1404
    {
1405
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1406
        Py_UCS4 ch;
1407
        Py_ssize_t i;
1408
        for (i=0; i < how_many; i++) {
1409
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1410
            assert(ch <= to_maxchar);
1411
        }
1412
    }
1413
#endif
1414
1415
345M
    if (from_kind == to_kind) {
1416
234M
        if (check_maxchar
1417
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1418
0
        {
1419
            /* Writing Latin-1 characters into an ASCII string requires to
1420
               check that all written characters are pure ASCII */
1421
0
            Py_UCS4 max_char;
1422
0
            max_char = ucs1lib_find_max_char(from_data,
1423
0
                                             (const Py_UCS1*)from_data + how_many);
1424
0
            if (max_char >= 128)
1425
0
                return -1;
1426
0
        }
1427
234M
        memcpy((char*)to_data + to_kind * to_start,
1428
234M
                  (const char*)from_data + from_kind * from_start,
1429
234M
                  to_kind * how_many);
1430
234M
    }
1431
111M
    else if (from_kind == PyUnicode_1BYTE_KIND
1432
108M
             && to_kind == PyUnicode_2BYTE_KIND)
1433
95.6M
    {
1434
95.6M
        _PyUnicode_CONVERT_BYTES(
1435
95.6M
            Py_UCS1, Py_UCS2,
1436
95.6M
            PyUnicode_1BYTE_DATA(from) + from_start,
1437
95.6M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1438
95.6M
            PyUnicode_2BYTE_DATA(to) + to_start
1439
95.6M
            );
1440
95.6M
    }
1441
15.5M
    else if (from_kind == PyUnicode_1BYTE_KIND
1442
12.6M
             && to_kind == PyUnicode_4BYTE_KIND)
1443
12.6M
    {
1444
12.6M
        _PyUnicode_CONVERT_BYTES(
1445
12.6M
            Py_UCS1, Py_UCS4,
1446
12.6M
            PyUnicode_1BYTE_DATA(from) + from_start,
1447
12.6M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1448
12.6M
            PyUnicode_4BYTE_DATA(to) + to_start
1449
12.6M
            );
1450
12.6M
    }
1451
2.81M
    else if (from_kind == PyUnicode_2BYTE_KIND
1452
2.80M
             && to_kind == PyUnicode_4BYTE_KIND)
1453
2.80M
    {
1454
2.80M
        _PyUnicode_CONVERT_BYTES(
1455
2.80M
            Py_UCS2, Py_UCS4,
1456
2.80M
            PyUnicode_2BYTE_DATA(from) + from_start,
1457
2.80M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1458
2.80M
            PyUnicode_4BYTE_DATA(to) + to_start
1459
2.80M
            );
1460
2.80M
    }
1461
12.0k
    else {
1462
12.0k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1463
1464
12.0k
        if (!check_maxchar) {
1465
12.0k
            if (from_kind == PyUnicode_2BYTE_KIND
1466
2.26k
                && to_kind == PyUnicode_1BYTE_KIND)
1467
2.26k
            {
1468
2.26k
                _PyUnicode_CONVERT_BYTES(
1469
2.26k
                    Py_UCS2, Py_UCS1,
1470
2.26k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1471
2.26k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1472
2.26k
                    PyUnicode_1BYTE_DATA(to) + to_start
1473
2.26k
                    );
1474
2.26k
            }
1475
9.73k
            else if (from_kind == PyUnicode_4BYTE_KIND
1476
9.73k
                     && to_kind == PyUnicode_1BYTE_KIND)
1477
6.46k
            {
1478
6.46k
                _PyUnicode_CONVERT_BYTES(
1479
6.46k
                    Py_UCS4, Py_UCS1,
1480
6.46k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1481
6.46k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1482
6.46k
                    PyUnicode_1BYTE_DATA(to) + to_start
1483
6.46k
                    );
1484
6.46k
            }
1485
3.27k
            else if (from_kind == PyUnicode_4BYTE_KIND
1486
3.27k
                     && to_kind == PyUnicode_2BYTE_KIND)
1487
3.27k
            {
1488
3.27k
                _PyUnicode_CONVERT_BYTES(
1489
3.27k
                    Py_UCS4, Py_UCS2,
1490
3.27k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1491
3.27k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1492
3.27k
                    PyUnicode_2BYTE_DATA(to) + to_start
1493
3.27k
                    );
1494
3.27k
            }
1495
0
            else {
1496
0
                Py_UNREACHABLE();
1497
0
            }
1498
12.0k
        }
1499
0
        else {
1500
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1501
0
            Py_UCS4 ch;
1502
0
            Py_ssize_t i;
1503
1504
0
            for (i=0; i < how_many; i++) {
1505
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1506
0
                if (ch > to_maxchar)
1507
0
                    return -1;
1508
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1509
0
            }
1510
0
        }
1511
12.0k
    }
1512
345M
    return 0;
1513
345M
}
1514
1515
void
1516
_PyUnicode_FastCopyCharacters(
1517
    PyObject *to, Py_ssize_t to_start,
1518
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1519
347M
{
1520
347M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1521
347M
}
1522
1523
Py_ssize_t
1524
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1525
                         PyObject *from, Py_ssize_t from_start,
1526
                         Py_ssize_t how_many)
1527
0
{
1528
0
    int err;
1529
1530
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1531
0
        PyErr_BadInternalCall();
1532
0
        return -1;
1533
0
    }
1534
1535
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1536
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1537
0
        return -1;
1538
0
    }
1539
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1540
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1541
0
        return -1;
1542
0
    }
1543
0
    if (how_many < 0) {
1544
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1545
0
        return -1;
1546
0
    }
1547
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1548
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1549
0
        PyErr_Format(PyExc_SystemError,
1550
0
                     "Cannot write %zi characters at %zi "
1551
0
                     "in a string of %zi characters",
1552
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1553
0
        return -1;
1554
0
    }
1555
1556
0
    if (how_many == 0)
1557
0
        return 0;
1558
1559
0
    if (unicode_check_modifiable(to))
1560
0
        return -1;
1561
1562
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1563
0
    if (err) {
1564
0
        PyErr_Format(PyExc_SystemError,
1565
0
                     "Cannot copy %s characters "
1566
0
                     "into a string of %s characters",
1567
0
                     unicode_kind_name(from),
1568
0
                     unicode_kind_name(to));
1569
0
        return -1;
1570
0
    }
1571
0
    return how_many;
1572
0
}
1573
1574
/* Find the maximum code point and count the number of surrogate pairs so a
1575
   correct string length can be computed before converting a string to UCS4.
1576
   This function counts single surrogates as a character and not as a pair.
1577
1578
   Return 0 on success, or -1 on error. */
1579
static int
1580
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1581
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1582
529k
{
1583
529k
    const wchar_t *iter;
1584
529k
    Py_UCS4 ch;
1585
1586
529k
    assert(num_surrogates != NULL && maxchar != NULL);
1587
529k
    *num_surrogates = 0;
1588
529k
    *maxchar = 0;
1589
1590
13.8M
    for (iter = begin; iter < end; ) {
1591
#if SIZEOF_WCHAR_T == 2
1592
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1593
            && (iter+1) < end
1594
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1595
        {
1596
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1597
            ++(*num_surrogates);
1598
            iter += 2;
1599
        }
1600
        else
1601
#endif
1602
13.3M
        {
1603
13.3M
            ch = *iter;
1604
13.3M
            iter++;
1605
13.3M
        }
1606
13.3M
        if (ch > *maxchar) {
1607
2.13M
            *maxchar = ch;
1608
2.13M
            if (*maxchar > MAX_UNICODE) {
1609
0
                PyErr_Format(PyExc_ValueError,
1610
0
                             "character U+%x is not in range [U+0000; U+%x]",
1611
0
                             ch, MAX_UNICODE);
1612
0
                return -1;
1613
0
            }
1614
2.13M
        }
1615
13.3M
    }
1616
529k
    return 0;
1617
529k
}
1618
1619
static void
1620
unicode_dealloc(PyObject *unicode)
1621
582M
{
1622
#ifdef Py_DEBUG
1623
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1624
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1625
    }
1626
#endif
1627
582M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1628
        /* This should never get called, but we also don't want to SEGV if
1629
        * we accidentally decref an immortal string out of existence. Since
1630
        * the string is an immortal object, just re-set the reference count.
1631
        */
1632
#ifdef Py_DEBUG
1633
        Py_UNREACHABLE();
1634
#endif
1635
0
        _Py_SetImmortal(unicode);
1636
0
        return;
1637
0
    }
1638
582M
    switch (_PyUnicode_STATE(unicode).interned) {
1639
582M
        case SSTATE_NOT_INTERNED:
1640
582M
            break;
1641
447k
        case SSTATE_INTERNED_MORTAL:
1642
            /* Remove the object from the intern dict.
1643
             * Before doing so, we set the refcount to 2: the key and value
1644
             * in the interned_dict.
1645
             */
1646
447k
            assert(Py_REFCNT(unicode) == 0);
1647
447k
            Py_SET_REFCNT(unicode, 2);
1648
#ifdef Py_REF_DEBUG
1649
            /* let's be pedantic with the ref total */
1650
            _Py_IncRefTotal(_PyThreadState_GET());
1651
            _Py_IncRefTotal(_PyThreadState_GET());
1652
#endif
1653
447k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1654
447k
            PyObject *interned = get_interned_dict(interp);
1655
447k
            assert(interned != NULL);
1656
447k
            PyObject *popped;
1657
447k
            int r = PyDict_Pop(interned, unicode, &popped);
1658
447k
            if (r == -1) {
1659
0
                PyErr_FormatUnraisable("Exception ignored while "
1660
0
                                       "removing an interned string %R",
1661
0
                                       unicode);
1662
                // We don't know what happened to the string. It's probably
1663
                // best to leak it:
1664
                // - if it was popped, there are no more references to it
1665
                //   so it can't cause trouble (except wasted memory)
1666
                // - if it wasn't popped, it'll remain interned
1667
0
                _Py_SetImmortal(unicode);
1668
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1669
0
                return;
1670
0
            }
1671
447k
            if (r == 0) {
1672
                // The interned string was not found in the interned_dict.
1673
#ifdef Py_DEBUG
1674
                Py_UNREACHABLE();
1675
#endif
1676
0
                _Py_SetImmortal(unicode);
1677
0
                return;
1678
0
            }
1679
            // Successfully popped.
1680
447k
            assert(popped == unicode);
1681
            // Only our `popped` reference should be left; remove it too.
1682
447k
            assert(Py_REFCNT(unicode) == 1);
1683
447k
            Py_SET_REFCNT(unicode, 0);
1684
#ifdef Py_REF_DEBUG
1685
            /* let's be pedantic with the ref total */
1686
            _Py_DecRefTotal(_PyThreadState_GET());
1687
#endif
1688
447k
            break;
1689
0
        default:
1690
            // As with `statically_allocated` above.
1691
#ifdef Py_REF_DEBUG
1692
            Py_UNREACHABLE();
1693
#endif
1694
0
            _Py_SetImmortal(unicode);
1695
0
            return;
1696
582M
    }
1697
582M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1698
163k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1699
163k
    }
1700
582M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1701
16.0M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1702
16.0M
    }
1703
1704
582M
    Py_TYPE(unicode)->tp_free(unicode);
1705
582M
}
1706
1707
#ifdef Py_DEBUG
/* Return 1 if `unicode` is the empty-string singleton or the cached
   one-character string for a code point below 256, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    if (unicode == &_Py_STR(empty)) {
        return 1;
    }

    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
    if (ascii->length != 1) {
        return 0;
    }

    /* Single character: compare against the LATIN1() entry for it. */
    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && LATIN1(ch) == unicode) {
        return 1;
    }
    return 0;
}
#endif
1725
1726
int
1727
_PyUnicode_IsModifiable(PyObject *unicode)
1728
73.0M
{
1729
73.0M
    assert(_PyUnicode_CHECK(unicode));
1730
73.0M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1731
2.93M
        return 0;
1732
70.1M
    if (PyUnicode_HASH(unicode) != -1)
1733
0
        return 0;
1734
70.1M
    if (PyUnicode_CHECK_INTERNED(unicode))
1735
0
        return 0;
1736
70.1M
    if (!PyUnicode_CheckExact(unicode))
1737
0
        return 0;
1738
#ifdef Py_DEBUG
1739
    /* singleton refcount is greater than 1 */
1740
    assert(!unicode_is_singleton(unicode));
1741
#endif
1742
70.1M
    return 1;
1743
70.1M
}
1744
1745
static int
1746
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1747
1.95M
{
1748
1.95M
    PyObject *unicode;
1749
1.95M
    Py_ssize_t old_length;
1750
1751
1.95M
    assert(p_unicode != NULL);
1752
1.95M
    unicode = *p_unicode;
1753
1754
1.95M
    assert(unicode != NULL);
1755
1.95M
    assert(PyUnicode_Check(unicode));
1756
1.95M
    assert(0 <= length);
1757
1758
1.95M
    old_length = PyUnicode_GET_LENGTH(unicode);
1759
1.95M
    if (old_length == length)
1760
0
        return 0;
1761
1762
1.95M
    if (length == 0) {
1763
0
        PyObject *empty = _PyUnicode_GetEmpty();
1764
0
        Py_SETREF(*p_unicode, empty);
1765
0
        return 0;
1766
0
    }
1767
1768
1.95M
    if (!_PyUnicode_IsModifiable(unicode)) {
1769
0
        PyObject *copy = resize_copy(unicode, length);
1770
0
        if (copy == NULL)
1771
0
            return -1;
1772
0
        Py_SETREF(*p_unicode, copy);
1773
0
        return 0;
1774
0
    }
1775
1776
1.95M
    if (PyUnicode_IS_COMPACT(unicode)) {
1777
1.95M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1778
1.95M
        if (new_unicode == NULL)
1779
0
            return -1;
1780
1.95M
        *p_unicode = new_unicode;
1781
1.95M
        return 0;
1782
1.95M
    }
1783
0
    return resize_inplace(unicode, length);
1784
1.95M
}
1785
1786
int
1787
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1788
0
{
1789
0
    PyObject *unicode;
1790
0
    if (p_unicode == NULL) {
1791
0
        PyErr_BadInternalCall();
1792
0
        return -1;
1793
0
    }
1794
0
    unicode = *p_unicode;
1795
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1796
0
    {
1797
0
        PyErr_BadInternalCall();
1798
0
        return -1;
1799
0
    }
1800
0
    return unicode_resize(p_unicode, length);
1801
0
}
1802
1803
static PyObject*
1804
get_latin1_char(Py_UCS1 ch)
1805
279M
{
1806
279M
    PyObject *o = LATIN1(ch);
1807
279M
    return o;
1808
279M
}
1809
1810
static PyObject*
1811
unicode_char(Py_UCS4 ch)
1812
301M
{
1813
301M
    PyObject *unicode;
1814
1815
301M
    assert(ch <= MAX_UNICODE);
1816
1817
301M
    if (ch < 256) {
1818
194M
        return get_latin1_char(ch);
1819
194M
    }
1820
1821
107M
    unicode = PyUnicode_New(1, ch);
1822
107M
    if (unicode == NULL)
1823
0
        return NULL;
1824
1825
107M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1826
107M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1827
98.6M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1828
98.6M
    } else {
1829
8.50M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1830
8.50M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1831
8.50M
    }
1832
107M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1833
107M
    return unicode;
1834
107M
}
1835
1836
1837
static inline void
1838
unicode_write_widechar(int kind, void *data,
1839
                       const wchar_t *u, Py_ssize_t size,
1840
                       Py_ssize_t num_surrogates)
1841
529k
{
1842
529k
    switch (kind) {
1843
498k
    case PyUnicode_1BYTE_KIND:
1844
498k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1845
498k
        break;
1846
1847
30.7k
    case PyUnicode_2BYTE_KIND:
1848
#if SIZEOF_WCHAR_T == 2
1849
        memcpy(data, u, size * 2);
1850
#else
1851
30.7k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1852
30.7k
#endif
1853
30.7k
        break;
1854
1855
699
    case PyUnicode_4BYTE_KIND:
1856
699
    {
1857
#if SIZEOF_WCHAR_T == 2
1858
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1859
        // surrogate pairs.
1860
        const wchar_t *end = u + size;
1861
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1862
#  ifndef NDEBUG
1863
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1864
#  endif
1865
        for (const wchar_t *iter = u; iter < end; ) {
1866
            assert(ucs4_out < ucs4_end);
1867
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1868
                && (iter+1) < end
1869
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1870
            {
1871
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1872
                iter += 2;
1873
            }
1874
            else {
1875
                *ucs4_out++ = *iter;
1876
                iter++;
1877
            }
1878
        }
1879
        assert(ucs4_out == ucs4_end);
1880
#else
1881
699
        assert(num_surrogates == 0);
1882
699
        memcpy(data, u, size * 4);
1883
699
#endif
1884
699
        break;
1885
0
    }
1886
0
    default:
1887
0
        Py_UNREACHABLE();
1888
529k
    }
1889
529k
}
1890
1891
1892
PyObject *
1893
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1894
978k
{
1895
978k
    PyObject *unicode;
1896
978k
    Py_UCS4 maxchar = 0;
1897
978k
    Py_ssize_t num_surrogates;
1898
1899
978k
    if (u == NULL && size != 0) {
1900
0
        PyErr_BadInternalCall();
1901
0
        return NULL;
1902
0
    }
1903
1904
978k
    if (size == -1) {
1905
1.22k
        size = wcslen(u);
1906
1.22k
    }
1907
1908
    /* If the Unicode data is known at construction time, we can apply
1909
       some optimizations which share commonly used objects. */
1910
1911
    /* Optimization for empty strings */
1912
978k
    if (size == 0)
1913
357k
        _Py_RETURN_UNICODE_EMPTY();
1914
1915
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1916
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1917
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1918
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1919
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1920
        if (!converted) {
1921
            return NULL;
1922
        }
1923
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1924
        PyMem_Free(converted);
1925
        return unicode;
1926
    }
1927
#endif
1928
1929
    /* Single character Unicode objects in the Latin-1 range are
1930
       shared when using this constructor */
1931
620k
    if (size == 1 && (Py_UCS4)*u < 256)
1932
91.5k
        return get_latin1_char((unsigned char)*u);
1933
1934
    /* If not empty and not single character, copy the Unicode data
1935
       into the new object */
1936
529k
    if (find_maxchar_surrogates(u, u + size,
1937
529k
                                &maxchar, &num_surrogates) == -1)
1938
0
        return NULL;
1939
1940
529k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1941
529k
    if (!unicode)
1942
0
        return NULL;
1943
1944
529k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1945
529k
                           u, size, num_surrogates);
1946
1947
529k
    return unicode_result(unicode);
1948
529k
}
1949
1950
1951
int
1952
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1953
                              const wchar_t *str,
1954
                              Py_ssize_t size)
1955
0
{
1956
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1957
1958
0
    if (size < 0) {
1959
0
        size = wcslen(str);
1960
0
    }
1961
1962
0
    if (size == 0) {
1963
0
        return 0;
1964
0
    }
1965
1966
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1967
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1968
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1969
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1970
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1971
        if (!converted) {
1972
            return -1;
1973
        }
1974
1975
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1976
        PyMem_Free(converted);
1977
        return res;
1978
    }
1979
#endif
1980
1981
0
    Py_UCS4 maxchar = 0;
1982
0
    Py_ssize_t num_surrogates;
1983
0
    if (find_maxchar_surrogates(str, str + size,
1984
0
                                &maxchar, &num_surrogates) == -1) {
1985
0
        return -1;
1986
0
    }
1987
1988
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1989
0
        return -1;
1990
0
    }
1991
1992
0
    int kind = writer->kind;
1993
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1994
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1995
1996
0
    writer->pos += size - num_surrogates;
1997
0
    return 0;
1998
0
}
1999
2000
2001
PyObject *
2002
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2003
820k
{
2004
820k
    if (size < 0) {
2005
0
        PyErr_SetString(PyExc_SystemError,
2006
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2007
0
        return NULL;
2008
0
    }
2009
820k
    if (u != NULL) {
2010
820k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2011
820k
    }
2012
0
    if (size > 0) {
2013
0
        PyErr_SetString(PyExc_SystemError,
2014
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2015
0
        return NULL;
2016
0
    }
2017
0
    return _PyUnicode_GetEmpty();
2018
0
}
2019
2020
PyObject *
2021
PyUnicode_FromString(const char *u)
2022
14.7M
{
2023
14.7M
    size_t size = strlen(u);
2024
14.7M
    if (size > PY_SSIZE_T_MAX) {
2025
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2026
0
        return NULL;
2027
0
    }
2028
14.7M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2029
14.7M
}
2030
2031
2032
PyObject *
2033
_PyUnicode_FromId(_Py_Identifier *id)
2034
0
{
2035
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2036
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2037
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2038
2039
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2040
0
    if (index < 0) {
2041
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2042
2043
0
        PyMutex_Lock(&rt_ids->mutex);
2044
        // Check again to detect concurrent access. Another thread can have
2045
        // initialized the index while this thread waited for the lock.
2046
0
        index = _Py_atomic_load_ssize(&id->index);
2047
0
        if (index < 0) {
2048
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2049
0
            index = rt_ids->next_index;
2050
0
            rt_ids->next_index++;
2051
0
            _Py_atomic_store_ssize(&id->index, index);
2052
0
        }
2053
0
        PyMutex_Unlock(&rt_ids->mutex);
2054
0
    }
2055
0
    assert(index >= 0);
2056
2057
0
    PyObject *obj;
2058
0
    if (index < ids->size) {
2059
0
        obj = ids->array[index];
2060
0
        if (obj) {
2061
            // Return a borrowed reference
2062
0
            goto end;
2063
0
        }
2064
0
    }
2065
2066
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2067
0
                                       NULL, NULL);
2068
0
    if (!obj) {
2069
0
        goto end;
2070
0
    }
2071
0
    _PyUnicode_InternImmortal(interp, &obj);
2072
2073
0
    if (index >= ids->size) {
2074
        // Overallocate to reduce the number of realloc
2075
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2076
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2077
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2078
0
        if (new_array == NULL) {
2079
0
            PyErr_NoMemory();
2080
0
            obj = NULL;
2081
0
            goto end;
2082
0
        }
2083
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2084
0
        ids->array = new_array;
2085
0
        ids->size = new_size;
2086
0
    }
2087
2088
    // The array stores a strong reference
2089
0
    ids->array[index] = obj;
2090
2091
0
end:
2092
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2093
    // Return a borrowed reference
2094
0
    return obj;
2095
0
}
2096
2097
2098
static void
2099
unicode_clear_identifiers(struct _Py_unicode_state *state)
2100
0
{
2101
0
    struct _Py_unicode_ids *ids = &state->ids;
2102
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2103
0
        Py_XDECREF(ids->array[i]);
2104
0
    }
2105
0
    ids->size = 0;
2106
0
    PyMem_Free(ids->array);
2107
0
    ids->array = NULL;
2108
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2109
    // after Py_Finalize().
2110
0
}
2111
2112
2113
/* Internal function, doesn't check maximum character */
2114
2115
PyObject*
2116
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2117
131M
{
2118
131M
    const unsigned char *s = (const unsigned char *)buffer;
2119
131M
    PyObject *unicode;
2120
131M
    if (size == 1) {
2121
#ifdef Py_DEBUG
2122
        assert((unsigned char)s[0] < 128);
2123
#endif
2124
43.3M
        return get_latin1_char(s[0]);
2125
43.3M
    }
2126
87.9M
    unicode = PyUnicode_New(size, 127);
2127
87.9M
    if (!unicode)
2128
0
        return NULL;
2129
87.9M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2130
87.9M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2131
87.9M
    return unicode;
2132
87.9M
}
2133
2134
static Py_UCS4
2135
kind_maxchar_limit(int kind)
2136
0
{
2137
0
    switch (kind) {
2138
0
    case PyUnicode_1BYTE_KIND:
2139
0
        return 0x80;
2140
0
    case PyUnicode_2BYTE_KIND:
2141
0
        return 0x100;
2142
0
    case PyUnicode_4BYTE_KIND:
2143
0
        return 0x10000;
2144
0
    default:
2145
0
        Py_UNREACHABLE();
2146
0
    }
2147
0
}
2148
2149
static PyObject*
2150
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2151
52.7M
{
2152
52.7M
    PyObject *res;
2153
52.7M
    unsigned char max_char;
2154
2155
52.7M
    if (size == 0) {
2156
8.38M
        _Py_RETURN_UNICODE_EMPTY();
2157
8.38M
    }
2158
52.7M
    assert(size > 0);
2159
44.3M
    if (size == 1) {
2160
10.6M
        return get_latin1_char(u[0]);
2161
10.6M
    }
2162
2163
33.7M
    max_char = ucs1lib_find_max_char(u, u + size);
2164
33.7M
    res = PyUnicode_New(size, max_char);
2165
33.7M
    if (!res)
2166
0
        return NULL;
2167
33.7M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2168
33.7M
    assert(_PyUnicode_CheckConsistency(res, 1));
2169
33.7M
    return res;
2170
33.7M
}
2171
2172
static PyObject*
2173
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2174
99.6M
{
2175
99.6M
    PyObject *res;
2176
99.6M
    Py_UCS2 max_char;
2177
2178
99.6M
    if (size == 0)
2179
11.7M
        _Py_RETURN_UNICODE_EMPTY();
2180
99.6M
    assert(size > 0);
2181
87.8M
    if (size == 1)
2182
59.0M
        return unicode_char(u[0]);
2183
2184
28.8M
    max_char = ucs2lib_find_max_char(u, u + size);
2185
28.8M
    res = PyUnicode_New(size, max_char);
2186
28.8M
    if (!res)
2187
0
        return NULL;
2188
28.8M
    if (max_char >= 256)
2189
18.8M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2190
10.0M
    else {
2191
10.0M
        _PyUnicode_CONVERT_BYTES(
2192
10.0M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2193
10.0M
    }
2194
28.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2195
28.8M
    return res;
2196
28.8M
}
2197
2198
static PyObject*
2199
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2200
64.5M
{
2201
64.5M
    PyObject *res;
2202
64.5M
    Py_UCS4 max_char;
2203
2204
64.5M
    if (size == 0)
2205
9.19M
        _Py_RETURN_UNICODE_EMPTY();
2206
64.5M
    assert(size > 0);
2207
55.3M
    if (size == 1)
2208
35.6M
        return unicode_char(u[0]);
2209
2210
19.6M
    max_char = ucs4lib_find_max_char(u, u + size);
2211
19.6M
    res = PyUnicode_New(size, max_char);
2212
19.6M
    if (!res)
2213
0
        return NULL;
2214
19.6M
    if (max_char < 256)
2215
13.5M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2216
19.6M
                                 PyUnicode_1BYTE_DATA(res));
2217
6.11M
    else if (max_char < 0x10000)
2218
4.71M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2219
6.11M
                                 PyUnicode_2BYTE_DATA(res));
2220
1.39M
    else
2221
1.39M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2222
19.6M
    assert(_PyUnicode_CheckConsistency(res, 1));
2223
19.6M
    return res;
2224
19.6M
}
2225
2226
2227
int
2228
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2229
                          Py_UCS4 *str,
2230
                          Py_ssize_t size)
2231
0
{
2232
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2233
2234
0
    if (size < 0) {
2235
0
        PyErr_SetString(PyExc_ValueError,
2236
0
                        "size must be positive");
2237
0
        return -1;
2238
0
    }
2239
2240
0
    if (size == 0) {
2241
0
        return 0;
2242
0
    }
2243
2244
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2245
2246
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2247
0
        return -1;
2248
0
    }
2249
2250
0
    int kind = writer->kind;
2251
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2252
0
    if (kind == PyUnicode_1BYTE_KIND) {
2253
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2254
0
                                 str, str + size,
2255
0
                                 data);
2256
0
    }
2257
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2258
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2259
0
                                 str, str + size,
2260
0
                                 data);
2261
0
    }
2262
0
    else {
2263
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2264
0
    }
2265
0
    writer->pos += size;
2266
2267
0
    return 0;
2268
0
}
2269
2270
2271
PyObject*
2272
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2273
160M
{
2274
160M
    if (size < 0) {
2275
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2276
0
        return NULL;
2277
0
    }
2278
160M
    switch (kind) {
2279
22.6M
    case PyUnicode_1BYTE_KIND:
2280
22.6M
        return _PyUnicode_FromUCS1(buffer, size);
2281
86.1M
    case PyUnicode_2BYTE_KIND:
2282
86.1M
        return _PyUnicode_FromUCS2(buffer, size);
2283
51.4M
    case PyUnicode_4BYTE_KIND:
2284
51.4M
        return _PyUnicode_FromUCS4(buffer, size);
2285
0
    default:
2286
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2287
0
        return NULL;
2288
160M
    }
2289
160M
}
2290
2291
Py_UCS4
2292
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2293
14.6M
{
2294
14.6M
    int kind;
2295
14.6M
    const void *startptr, *endptr;
2296
2297
14.6M
    assert(0 <= start);
2298
14.6M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2299
14.6M
    assert(start <= end);
2300
2301
14.6M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2302
66.9k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2303
2304
14.5M
    if (start == end)
2305
0
        return 127;
2306
2307
14.5M
    if (PyUnicode_IS_ASCII(unicode))
2308
14.5M
        return 127;
2309
2310
22.0k
    kind = PyUnicode_KIND(unicode);
2311
22.0k
    startptr = PyUnicode_DATA(unicode);
2312
22.0k
    endptr = (char *)startptr + end * kind;
2313
22.0k
    startptr = (char *)startptr + start * kind;
2314
22.0k
    switch(kind) {
2315
1.53k
    case PyUnicode_1BYTE_KIND:
2316
1.53k
        return ucs1lib_find_max_char(startptr, endptr);
2317
4.04k
    case PyUnicode_2BYTE_KIND:
2318
4.04k
        return ucs2lib_find_max_char(startptr, endptr);
2319
16.4k
    case PyUnicode_4BYTE_KIND:
2320
16.4k
        return ucs4lib_find_max_char(startptr, endptr);
2321
0
    default:
2322
0
        Py_UNREACHABLE();
2323
22.0k
    }
2324
22.0k
}
2325
2326
/* Ensure that a string uses the most efficient storage, if it is not the
2327
   case: create a new string with of the right kind. Write NULL into *p_unicode
2328
   on error. */
2329
static void
2330
unicode_adjust_maxchar(PyObject **p_unicode)
2331
0
{
2332
0
    PyObject *unicode, *copy;
2333
0
    Py_UCS4 max_char;
2334
0
    Py_ssize_t len;
2335
0
    int kind;
2336
2337
0
    assert(p_unicode != NULL);
2338
0
    unicode = *p_unicode;
2339
0
    if (PyUnicode_IS_ASCII(unicode))
2340
0
        return;
2341
2342
0
    len = PyUnicode_GET_LENGTH(unicode);
2343
0
    kind = PyUnicode_KIND(unicode);
2344
0
    if (kind == PyUnicode_1BYTE_KIND) {
2345
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2346
0
        max_char = ucs1lib_find_max_char(u, u + len);
2347
0
        if (max_char >= 128)
2348
0
            return;
2349
0
    }
2350
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2351
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2352
0
        max_char = ucs2lib_find_max_char(u, u + len);
2353
0
        if (max_char >= 256)
2354
0
            return;
2355
0
    }
2356
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2357
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2358
0
        max_char = ucs4lib_find_max_char(u, u + len);
2359
0
        if (max_char >= 0x10000)
2360
0
            return;
2361
0
    }
2362
0
    else
2363
0
        Py_UNREACHABLE();
2364
2365
0
    copy = PyUnicode_New(len, max_char);
2366
0
    if (copy != NULL)
2367
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2368
0
    Py_DECREF(unicode);
2369
0
    *p_unicode = copy;
2370
0
}
2371
2372
/* Return a new string with the same length, maximum character value and
   character data as `unicode`.

   A non-str argument raises via PyErr_BadInternalCall().  Return NULL on
   error. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Both strings share a kind, so the raw character data is copied. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2394
2395
2396
/* Widen Unicode objects to larger buffers. Don't write terminating null
2397
   character. Return NULL on error. */
2398
2399
static void*
2400
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2401
13.2M
{
2402
13.2M
    void *result;
2403
2404
13.2M
    assert(skind < kind);
2405
13.2M
    switch (kind) {
2406
11.1M
    case PyUnicode_2BYTE_KIND:
2407
11.1M
        result = PyMem_New(Py_UCS2, len);
2408
11.1M
        if (!result)
2409
0
            return PyErr_NoMemory();
2410
11.1M
        assert(skind == PyUnicode_1BYTE_KIND);
2411
11.1M
        _PyUnicode_CONVERT_BYTES(
2412
11.1M
            Py_UCS1, Py_UCS2,
2413
11.1M
            (const Py_UCS1 *)data,
2414
11.1M
            ((const Py_UCS1 *)data) + len,
2415
11.1M
            result);
2416
11.1M
        return result;
2417
2.05M
    case PyUnicode_4BYTE_KIND:
2418
2.05M
        result = PyMem_New(Py_UCS4, len);
2419
2.05M
        if (!result)
2420
0
            return PyErr_NoMemory();
2421
2.05M
        if (skind == PyUnicode_2BYTE_KIND) {
2422
0
            _PyUnicode_CONVERT_BYTES(
2423
0
                Py_UCS2, Py_UCS4,
2424
0
                (const Py_UCS2 *)data,
2425
0
                ((const Py_UCS2 *)data) + len,
2426
0
                result);
2427
0
        }
2428
2.05M
        else {
2429
2.05M
            assert(skind == PyUnicode_1BYTE_KIND);
2430
2.05M
            _PyUnicode_CONVERT_BYTES(
2431
2.05M
                Py_UCS1, Py_UCS4,
2432
2.05M
                (const Py_UCS1 *)data,
2433
2.05M
                ((const Py_UCS1 *)data) + len,
2434
2.05M
                result);
2435
2.05M
        }
2436
2.05M
        return result;
2437
0
    default:
2438
0
        Py_UNREACHABLE();
2439
0
        return NULL;
2440
13.2M
    }
2441
13.2M
}
2442
2443
static Py_UCS4*
2444
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2445
        int copy_null)
2446
78.8k
{
2447
78.8k
    int kind;
2448
78.8k
    const void *data;
2449
78.8k
    Py_ssize_t len, targetlen;
2450
78.8k
    kind = PyUnicode_KIND(string);
2451
78.8k
    data = PyUnicode_DATA(string);
2452
78.8k
    len = PyUnicode_GET_LENGTH(string);
2453
78.8k
    targetlen = len;
2454
78.8k
    if (copy_null)
2455
0
        targetlen++;
2456
78.8k
    if (!target) {
2457
0
        target = PyMem_New(Py_UCS4, targetlen);
2458
0
        if (!target) {
2459
0
            PyErr_NoMemory();
2460
0
            return NULL;
2461
0
        }
2462
0
    }
2463
78.8k
    else {
2464
78.8k
        if (targetsize < targetlen) {
2465
0
            PyErr_Format(PyExc_SystemError,
2466
0
                         "string is longer than the buffer");
2467
0
            if (copy_null && 0 < targetsize)
2468
0
                target[0] = 0;
2469
0
            return NULL;
2470
0
        }
2471
78.8k
    }
2472
78.8k
    if (kind == PyUnicode_1BYTE_KIND) {
2473
55.4k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2474
55.4k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2475
55.4k
    }
2476
23.4k
    else if (kind == PyUnicode_2BYTE_KIND) {
2477
16.9k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2478
16.9k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2479
16.9k
    }
2480
6.43k
    else if (kind == PyUnicode_4BYTE_KIND) {
2481
6.43k
        memcpy(target, data, len * sizeof(Py_UCS4));
2482
6.43k
    }
2483
0
    else {
2484
0
        Py_UNREACHABLE();
2485
0
    }
2486
78.8k
    if (copy_null)
2487
0
        target[len] = 0;
2488
78.8k
    return target;
2489
78.8k
}
2490
2491
Py_UCS4*
2492
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2493
                 int copy_null)
2494
78.8k
{
2495
78.8k
    if (target == NULL || targetsize < 0) {
2496
0
        PyErr_BadInternalCall();
2497
0
        return NULL;
2498
0
    }
2499
78.8k
    return as_ucs4(string, target, targetsize, copy_null);
2500
78.8k
}
2501
2502
Py_UCS4*
2503
PyUnicode_AsUCS4Copy(PyObject *string)
2504
0
{
2505
0
    return as_ucs4(string, NULL, 0, 1);
2506
0
}
2507
2508
/* maximum number of characters required for output of %jo or %jd or %p.
2509
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2510
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2511
   plus 1 for the terminal NUL. */
2512
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2513
2514
static int
2515
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2516
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2517
9.86M
{
2518
9.86M
    Py_ssize_t length, fill, arglen;
2519
9.86M
    Py_UCS4 maxchar;
2520
2521
9.86M
    length = PyUnicode_GET_LENGTH(str);
2522
9.86M
    if ((precision == -1 || precision >= length)
2523
9.86M
        && width <= length)
2524
9.86M
        return _PyUnicodeWriter_WriteStr(writer, str);
2525
2526
46
    if (precision != -1)
2527
46
        length = Py_MIN(precision, length);
2528
2529
46
    arglen = Py_MAX(length, width);
2530
46
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2531
17
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2532
29
    else
2533
29
        maxchar = writer->maxchar;
2534
2535
46
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2536
0
        return -1;
2537
2538
46
    fill = Py_MAX(width - length, 0);
2539
46
    if (fill && !(flags & F_LJUST)) {
2540
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2541
0
            return -1;
2542
0
        writer->pos += fill;
2543
0
    }
2544
2545
46
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2546
46
                                  str, 0, length);
2547
46
    writer->pos += length;
2548
2549
46
    if (fill && (flags & F_LJUST)) {
2550
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2551
0
            return -1;
2552
0
        writer->pos += fill;
2553
0
    }
2554
2555
46
    return 0;
2556
46
}
2557
2558
static int
2559
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2560
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2561
4.03M
{
2562
    /* UTF-8 */
2563
4.03M
    Py_ssize_t *pconsumed = NULL;
2564
4.03M
    Py_ssize_t length;
2565
4.03M
    if (precision == -1) {
2566
1.05M
        length = strlen(str);
2567
1.05M
    }
2568
2.98M
    else {
2569
2.98M
        length = 0;
2570
46.9M
        while (length < precision && str[length]) {
2571
44.0M
            length++;
2572
44.0M
        }
2573
2.98M
        if (length == precision) {
2574
            /* The input string is not NUL-terminated.  If it ends with an
2575
             * incomplete UTF-8 sequence, truncate the string just before it.
2576
             * Incomplete sequences in the middle and sequences which cannot
2577
             * be valid prefixes are still treated as errors and replaced
2578
             * with \xfffd. */
2579
1.45k
            pconsumed = &length;
2580
1.45k
        }
2581
2.98M
    }
2582
2583
4.03M
    if (width < 0) {
2584
4.03M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2585
4.03M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2586
4.03M
    }
2587
2588
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2589
0
                                                     "replace", pconsumed);
2590
0
    if (unicode == NULL)
2591
0
        return -1;
2592
2593
0
    int res = unicode_fromformat_write_str(writer, unicode,
2594
0
                                           width, -1, flags);
2595
0
    Py_DECREF(unicode);
2596
0
    return res;
2597
0
}
2598
2599
static int
2600
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2601
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2602
0
{
2603
0
    Py_ssize_t length;
2604
0
    if (precision == -1) {
2605
0
        length = wcslen(str);
2606
0
    }
2607
0
    else {
2608
0
        length = 0;
2609
0
        while (length < precision && str[length]) {
2610
0
            length++;
2611
0
        }
2612
0
    }
2613
2614
0
    if (width < 0) {
2615
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2616
0
                                             str, length);
2617
0
    }
2618
2619
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2620
0
    if (unicode == NULL)
2621
0
        return -1;
2622
2623
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2624
0
    Py_DECREF(unicode);
2625
0
    return res;
2626
0
}
2627
2628
0
#define F_LONG 1
2629
0
#define F_LONGLONG 2
2630
209k
#define F_SIZE 3
2631
0
#define F_PTRDIFF 4
2632
0
#define F_INTMAX 5
2633
2634
static const char*
2635
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2636
                       const char *f, va_list *vargs)
2637
33.0M
{
2638
33.0M
    const char *p;
2639
33.0M
    Py_ssize_t len;
2640
33.0M
    int flags = 0;
2641
33.0M
    Py_ssize_t width;
2642
33.0M
    Py_ssize_t precision;
2643
2644
33.0M
    p = f;
2645
33.0M
    f++;
2646
33.0M
    if (*f == '%') {
2647
4.49M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2648
0
            return NULL;
2649
4.49M
        f++;
2650
4.49M
        return f;
2651
4.49M
    }
2652
2653
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2654
    /* Flags '+', ' ' and '#' are not particularly useful.
2655
     * They are not worth the implementation and maintenance costs.
2656
     * In addition, '#' should add "0" for "o" conversions for compatibility
2657
     * with printf, but it would confuse Python users. */
2658
28.5M
    while (1) {
2659
28.5M
        switch (*f++) {
2660
0
        case '-': flags |= F_LJUST; continue;
2661
1.59k
        case '0': flags |= F_ZERO; continue;
2662
0
        case '#': flags |= F_ALT; continue;
2663
28.5M
        }
2664
28.5M
        f--;
2665
28.5M
        break;
2666
28.5M
    }
2667
2668
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2669
28.5M
    width = -1;
2670
28.5M
    if (*f == '*') {
2671
0
        width = va_arg(*vargs, int);
2672
0
        if (width < 0) {
2673
0
            flags |= F_LJUST;
2674
0
            width = -width;
2675
0
        }
2676
0
        f++;
2677
0
    }
2678
28.5M
    else if (Py_ISDIGIT((unsigned)*f)) {
2679
1.59k
        width = *f - '0';
2680
1.59k
        f++;
2681
1.59k
        while (Py_ISDIGIT((unsigned)*f)) {
2682
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2683
0
                PyErr_SetString(PyExc_ValueError,
2684
0
                                "width too big");
2685
0
                return NULL;
2686
0
            }
2687
0
            width = (width * 10) + (*f - '0');
2688
0
            f++;
2689
0
        }
2690
1.59k
    }
2691
28.5M
    precision = -1;
2692
28.5M
    if (*f == '.') {
2693
2.98M
        f++;
2694
2.98M
        if (*f == '*') {
2695
0
            precision = va_arg(*vargs, int);
2696
0
            if (precision < 0) {
2697
0
                precision = -2;
2698
0
            }
2699
0
            f++;
2700
0
        }
2701
2.98M
        else if (Py_ISDIGIT((unsigned)*f)) {
2702
2.98M
            precision = (*f - '0');
2703
2.98M
            f++;
2704
8.96M
            while (Py_ISDIGIT((unsigned)*f)) {
2705
5.97M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2706
0
                    PyErr_SetString(PyExc_ValueError,
2707
0
                                    "precision too big");
2708
0
                    return NULL;
2709
0
                }
2710
5.97M
                precision = (precision * 10) + (*f - '0');
2711
5.97M
                f++;
2712
5.97M
            }
2713
2.98M
        }
2714
2.98M
    }
2715
2716
28.5M
    int sizemod = 0;
2717
28.5M
    if (*f == 'l') {
2718
0
        if (f[1] == 'l') {
2719
0
            sizemod = F_LONGLONG;
2720
0
            f += 2;
2721
0
        }
2722
0
        else {
2723
0
            sizemod = F_LONG;
2724
0
            ++f;
2725
0
        }
2726
0
    }
2727
28.5M
    else if (*f == 'z') {
2728
104k
        sizemod = F_SIZE;
2729
104k
        ++f;
2730
104k
    }
2731
28.4M
    else if (*f == 't') {
2732
0
        sizemod = F_PTRDIFF;
2733
0
        ++f;
2734
0
    }
2735
28.4M
    else if (*f == 'j') {
2736
0
        sizemod = F_INTMAX;
2737
0
        ++f;
2738
0
    }
2739
28.5M
    if (f[0] != '\0' && f[1] == '\0')
2740
5.40M
        writer->overallocate = 0;
2741
2742
28.5M
    switch (*f) {
2743
9.35M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2744
9.35M
        break;
2745
5.30M
    case 'c': case 'p':
2746
5.30M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2747
5.30M
        break;
2748
5.30M
    case 's':
2749
4.03M
    case 'V':
2750
4.03M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2751
4.03M
        break;
2752
9.86M
    default:
2753
9.86M
        if (sizemod) goto invalid_format;
2754
9.86M
        break;
2755
28.5M
    }
2756
2757
28.5M
    switch (*f) {
2758
5.30M
    case 'c':
2759
5.30M
    {
2760
5.30M
        int ordinal = va_arg(*vargs, int);
2761
5.30M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2762
0
            PyErr_SetString(PyExc_OverflowError,
2763
0
                            "character argument not in range(0x110000)");
2764
0
            return NULL;
2765
0
        }
2766
5.30M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2767
0
            return NULL;
2768
5.30M
        break;
2769
5.30M
    }
2770
2771
9.33M
    case 'd': case 'i':
2772
9.35M
    case 'o': case 'u': case 'x': case 'X':
2773
9.35M
    {
2774
9.35M
        char buffer[MAX_INTMAX_CHARS];
2775
2776
        // Fill buffer using sprinf, with one of many possible format
2777
        // strings, like "%llX" for `long long` in hexadecimal.
2778
        // The type/size is in `sizemod`; the format is in `*f`.
2779
2780
        // Use macros with nested switches to keep the sprintf format strings
2781
        // as compile-time literals, avoiding warnings and maybe allowing
2782
        // optimizations.
2783
2784
        // `SPRINT` macro does one sprintf
2785
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2786
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2787
9.35M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2788
9.35M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2789
2790
        // One inner switch to handle all format variants
2791
9.35M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2792
9.35M
            switch (*f) {                                                     \
2793
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2794
15.8k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2795
1.17k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2796
933
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2797
9.33M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2798
9.35M
            }
2799
2800
        // Outer switch to handle all the sizes/types
2801
9.35M
        switch (sizemod) {
2802
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2803
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2804
104k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2805
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2806
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2807
9.24M
            default:         DO_SPRINTS("", int, unsigned int); break;
2808
9.35M
        }
2809
9.35M
        #undef SPRINT
2810
9.35M
        #undef DO_SPRINTS
2811
2812
9.35M
        assert(len >= 0);
2813
2814
9.35M
        int sign = (buffer[0] == '-');
2815
9.35M
        len -= sign;
2816
2817
9.35M
        precision = Py_MAX(precision, len);
2818
9.35M
        width = Py_MAX(width, precision + sign);
2819
9.35M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2820
1.59k
            precision = width - sign;
2821
1.59k
        }
2822
2823
9.35M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2824
9.35M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2825
2826
9.35M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2827
0
            return NULL;
2828
2829
9.35M
        if (spacepad && !(flags & F_LJUST)) {
2830
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2831
0
                return NULL;
2832
0
            writer->pos += spacepad;
2833
0
        }
2834
2835
9.35M
        if (sign) {
2836
765
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2837
0
                return NULL;
2838
765
        }
2839
2840
9.35M
        if (zeropad) {
2841
587
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2842
0
                return NULL;
2843
587
            writer->pos += zeropad;
2844
587
        }
2845
2846
9.35M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2847
0
            return NULL;
2848
2849
9.35M
        if (spacepad && (flags & F_LJUST)) {
2850
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2851
0
                return NULL;
2852
0
            writer->pos += spacepad;
2853
0
        }
2854
9.35M
        break;
2855
9.35M
    }
2856
2857
9.35M
    case 'p':
2858
1
    {
2859
1
        char number[MAX_INTMAX_CHARS];
2860
2861
1
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2862
1
        assert(len >= 0);
2863
2864
        /* %p is ill-defined:  ensure leading 0x. */
2865
1
        if (number[1] == 'X')
2866
0
            number[1] = 'x';
2867
1
        else if (number[1] != 'x') {
2868
0
            memmove(number + 2, number,
2869
0
                    strlen(number) + 1);
2870
0
            number[0] = '0';
2871
0
            number[1] = 'x';
2872
0
            len += 2;
2873
0
        }
2874
2875
1
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2876
0
            return NULL;
2877
1
        break;
2878
1
    }
2879
2880
4.03M
    case 's':
2881
4.03M
    {
2882
4.03M
        if (sizemod) {
2883
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2884
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2885
0
                return NULL;
2886
0
        }
2887
4.03M
        else {
2888
            /* UTF-8 */
2889
4.03M
            const char *s = va_arg(*vargs, const char*);
2890
4.03M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2891
0
                return NULL;
2892
4.03M
        }
2893
4.03M
        break;
2894
4.03M
    }
2895
2896
4.56M
    case 'U':
2897
4.56M
    {
2898
4.56M
        PyObject *obj = va_arg(*vargs, PyObject *);
2899
4.56M
        assert(obj && _PyUnicode_CHECK(obj));
2900
2901
4.56M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2902
0
            return NULL;
2903
4.56M
        break;
2904
4.56M
    }
2905
2906
4.56M
    case 'V':
2907
516
    {
2908
516
        PyObject *obj = va_arg(*vargs, PyObject *);
2909
516
        const char *str;
2910
516
        const wchar_t *wstr;
2911
516
        if (sizemod) {
2912
0
            wstr = va_arg(*vargs, const wchar_t*);
2913
0
        }
2914
516
        else {
2915
516
            str = va_arg(*vargs, const char *);
2916
516
        }
2917
516
        if (obj) {
2918
0
            assert(_PyUnicode_CHECK(obj));
2919
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2920
0
                return NULL;
2921
0
        }
2922
516
        else if (sizemod) {
2923
0
            assert(wstr != NULL);
2924
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2925
0
                return NULL;
2926
0
        }
2927
516
        else {
2928
516
            assert(str != NULL);
2929
516
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2930
0
                return NULL;
2931
516
        }
2932
516
        break;
2933
516
    }
2934
2935
1.80k
    case 'S':
2936
1.80k
    {
2937
1.80k
        PyObject *obj = va_arg(*vargs, PyObject *);
2938
1.80k
        PyObject *str;
2939
1.80k
        assert(obj);
2940
1.80k
        str = PyObject_Str(obj);
2941
1.80k
        if (!str)
2942
0
            return NULL;
2943
1.80k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2944
0
            Py_DECREF(str);
2945
0
            return NULL;
2946
0
        }
2947
1.80k
        Py_DECREF(str);
2948
1.80k
        break;
2949
1.80k
    }
2950
2951
1.51k
    case 'R':
2952
1.51k
    {
2953
1.51k
        PyObject *obj = va_arg(*vargs, PyObject *);
2954
1.51k
        PyObject *repr;
2955
1.51k
        assert(obj);
2956
1.51k
        repr = PyObject_Repr(obj);
2957
1.51k
        if (!repr)
2958
0
            return NULL;
2959
1.51k
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2960
0
            Py_DECREF(repr);
2961
0
            return NULL;
2962
0
        }
2963
1.51k
        Py_DECREF(repr);
2964
1.51k
        break;
2965
1.51k
    }
2966
2967
0
    case 'A':
2968
0
    {
2969
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2970
0
        PyObject *ascii;
2971
0
        assert(obj);
2972
0
        ascii = PyObject_ASCII(obj);
2973
0
        if (!ascii)
2974
0
            return NULL;
2975
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2976
0
            Py_DECREF(ascii);
2977
0
            return NULL;
2978
0
        }
2979
0
        Py_DECREF(ascii);
2980
0
        break;
2981
0
    }
2982
2983
5.29M
    case 'T':
2984
5.29M
    {
2985
5.29M
        PyObject *obj = va_arg(*vargs, PyObject *);
2986
5.29M
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2987
2988
5.29M
        PyObject *type_name;
2989
5.29M
        if (flags & F_ALT) {
2990
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2991
0
        }
2992
5.29M
        else {
2993
5.29M
            type_name = PyType_GetFullyQualifiedName(type);
2994
5.29M
        }
2995
5.29M
        Py_DECREF(type);
2996
5.29M
        if (!type_name) {
2997
0
            return NULL;
2998
0
        }
2999
3000
5.29M
        if (unicode_fromformat_write_str(writer, type_name,
3001
5.29M
                                         width, precision, flags) == -1) {
3002
0
            Py_DECREF(type_name);
3003
0
            return NULL;
3004
0
        }
3005
5.29M
        Py_DECREF(type_name);
3006
5.29M
        break;
3007
5.29M
    }
3008
3009
0
    case 'N':
3010
0
    {
3011
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3012
0
        assert(type_raw != NULL);
3013
3014
0
        if (!PyType_Check(type_raw)) {
3015
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3016
0
            return NULL;
3017
0
        }
3018
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3019
3020
0
        PyObject *type_name;
3021
0
        if (flags & F_ALT) {
3022
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3023
0
        }
3024
0
        else {
3025
0
            type_name = PyType_GetFullyQualifiedName(type);
3026
0
        }
3027
0
        if (!type_name) {
3028
0
            return NULL;
3029
0
        }
3030
0
        if (unicode_fromformat_write_str(writer, type_name,
3031
0
                                         width, precision, flags) == -1) {
3032
0
            Py_DECREF(type_name);
3033
0
            return NULL;
3034
0
        }
3035
0
        Py_DECREF(type_name);
3036
0
        break;
3037
0
    }
3038
3039
0
    default:
3040
0
    invalid_format:
3041
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3042
0
        return NULL;
3043
28.5M
    }
3044
3045
28.5M
    f++;
3046
28.5M
    return f;
3047
28.5M
}
3048
3049
static int
3050
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3051
14.0M
{
3052
14.0M
    Py_ssize_t len = strlen(format);
3053
14.0M
    writer->min_length += len + 100;
3054
14.0M
    writer->overallocate = 1;
3055
3056
    // Copy varags to be able to pass a reference to a subfunction.
3057
14.0M
    va_list vargs2;
3058
14.0M
    va_copy(vargs2, vargs);
3059
3060
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3061
    // to be encoded to ASCII.
3062
14.0M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3063
14.0M
    if (!is_ascii) {
3064
0
        Py_ssize_t i;
3065
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3066
0
        PyErr_Format(PyExc_ValueError,
3067
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3068
0
            "string, got a non-ASCII byte: 0x%02x",
3069
0
            (unsigned char)format[i]);
3070
0
        goto fail;
3071
0
    }
3072
3073
81.7M
    for (const char *f = format; *f; ) {
3074
67.7M
        if (*f == '%') {
3075
33.0M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3076
33.0M
            if (f == NULL)
3077
0
                goto fail;
3078
33.0M
        }
3079
34.6M
        else {
3080
34.6M
            const char *p = strchr(f, '%');
3081
34.6M
            if (p != NULL) {
3082
26.0M
                len = p - f;
3083
26.0M
            }
3084
8.62M
            else {
3085
8.62M
                len = strlen(f);
3086
8.62M
                writer->overallocate = 0;
3087
8.62M
            }
3088
3089
34.6M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3090
0
                goto fail;
3091
0
            }
3092
34.6M
            f += len;
3093
34.6M
        }
3094
67.7M
    }
3095
14.0M
    va_end(vargs2);
3096
14.0M
    return 0;
3097
3098
0
  fail:
3099
0
    va_end(vargs2);
3100
0
    return -1;
3101
14.0M
}
3102
3103
PyObject *
3104
PyUnicode_FromFormatV(const char *format, va_list vargs)
3105
14.0M
{
3106
14.0M
    _PyUnicodeWriter writer;
3107
14.0M
    _PyUnicodeWriter_Init(&writer);
3108
3109
14.0M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3110
0
        _PyUnicodeWriter_Dealloc(&writer);
3111
0
        return NULL;
3112
0
    }
3113
14.0M
    return _PyUnicodeWriter_Finish(&writer);
3114
14.0M
}
3115
3116
/* Variadic front end of PyUnicode_FromFormatV(). */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list ap;

    va_start(ap, format);
    PyObject *result = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    return result;
}
3127
3128
int
3129
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3130
0
{
3131
0
    va_list vargs;
3132
0
    va_start(vargs, format);
3133
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3134
0
    va_end(vargs);
3135
0
    return res;
3136
0
}
3137
3138
int
3139
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3140
                         va_list vargs)
3141
0
{
3142
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3143
0
    Py_ssize_t old_pos = _writer->pos;
3144
3145
0
    int res = unicode_from_format(_writer, format, vargs);
3146
3147
0
    if (res < 0) {
3148
0
        _writer->pos = old_pos;
3149
0
    }
3150
0
    return res;
3151
0
}
3152
3153
static Py_ssize_t
3154
unicode_get_widechar_size(PyObject *unicode)
3155
237k
{
3156
237k
    Py_ssize_t res;
3157
3158
237k
    assert(unicode != NULL);
3159
237k
    assert(_PyUnicode_CHECK(unicode));
3160
3161
237k
    res = _PyUnicode_LENGTH(unicode);
3162
#if SIZEOF_WCHAR_T == 2
3163
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3164
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3165
        const Py_UCS4 *end = s + res;
3166
        for (; s < end; ++s) {
3167
            if (*s > 0xFFFF) {
3168
                ++res;
3169
            }
3170
        }
3171
    }
3172
#endif
3173
237k
    return res;
3174
237k
}
3175
3176
static void
3177
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3178
237k
{
3179
237k
    assert(unicode != NULL);
3180
237k
    assert(_PyUnicode_CHECK(unicode));
3181
3182
237k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3183
699
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3184
699
        return;
3185
699
    }
3186
3187
236k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3188
205k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3189
4.71M
        for (; size--; ++s, ++w) {
3190
4.50M
            *w = *s;
3191
4.50M
        }
3192
205k
    }
3193
30.7k
    else {
3194
30.7k
#if SIZEOF_WCHAR_T == 4
3195
30.7k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3196
30.7k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3197
1.18M
        for (; size--; ++s, ++w) {
3198
1.15M
            *w = *s;
3199
1.15M
        }
3200
#else
3201
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3202
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3203
        for (; size--; ++s, ++w) {
3204
            Py_UCS4 ch = *s;
3205
            if (ch > 0xFFFF) {
3206
                assert(ch <= MAX_UNICODE);
3207
                /* encode surrogate pair in this case */
3208
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3209
                if (!size--)
3210
                    break;
3211
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3212
            }
3213
            else {
3214
                *w = ch;
3215
            }
3216
        }
3217
#endif
3218
30.7k
    }
3219
236k
}
3220
3221
#ifdef HAVE_WCHAR_H
3222
3223
/* Convert a Unicode object to a wide character string.
3224
3225
   - If w is NULL: return the number of wide characters (including the null
3226
     character) required to convert the unicode object. Ignore size argument.
3227
3228
   - Otherwise: return the number of wide characters (excluding the null
3229
     character) written into w. Write at most size wide characters (including
3230
     the null character). */
3231
Py_ssize_t
3232
PyUnicode_AsWideChar(PyObject *unicode,
3233
                     wchar_t *w,
3234
                     Py_ssize_t size)
3235
5.65k
{
3236
5.65k
    Py_ssize_t res;
3237
3238
5.65k
    if (unicode == NULL) {
3239
0
        PyErr_BadInternalCall();
3240
0
        return -1;
3241
0
    }
3242
5.65k
    if (!PyUnicode_Check(unicode)) {
3243
0
        PyErr_BadArgument();
3244
0
        return -1;
3245
0
    }
3246
3247
5.65k
    res = unicode_get_widechar_size(unicode);
3248
5.65k
    if (w == NULL) {
3249
0
        return res + 1;
3250
0
    }
3251
3252
5.65k
    if (size > res) {
3253
5.65k
        size = res + 1;
3254
5.65k
    }
3255
0
    else {
3256
0
        res = size;
3257
0
    }
3258
5.65k
    unicode_copy_as_widechar(unicode, w, size);
3259
3260
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3261
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3262
       non-Unicode locales and hence needs conversion first. */
3263
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3264
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3265
            return -1;
3266
        }
3267
    }
3268
#endif
3269
3270
5.65k
    return res;
3271
5.65k
}
3272
3273
wchar_t*
3274
PyUnicode_AsWideCharString(PyObject *unicode,
3275
                           Py_ssize_t *size)
3276
231k
{
3277
231k
    wchar_t *buffer;
3278
231k
    Py_ssize_t buflen;
3279
3280
231k
    if (unicode == NULL) {
3281
0
        PyErr_BadInternalCall();
3282
0
        return NULL;
3283
0
    }
3284
231k
    if (!PyUnicode_Check(unicode)) {
3285
0
        PyErr_BadArgument();
3286
0
        return NULL;
3287
0
    }
3288
3289
231k
    buflen = unicode_get_widechar_size(unicode);
3290
231k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3291
231k
    if (buffer == NULL) {
3292
0
        PyErr_NoMemory();
3293
0
        return NULL;
3294
0
    }
3295
231k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3296
3297
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3298
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3299
       non-Unicode locales and hence needs conversion first. */
3300
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3301
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3302
            return NULL;
3303
        }
3304
    }
3305
#endif
3306
3307
231k
    if (size != NULL) {
3308
230k
        *size = buflen;
3309
230k
    }
3310
1.05k
    else if (wcslen(buffer) != (size_t)buflen) {
3311
0
        PyMem_Free(buffer);
3312
0
        PyErr_SetString(PyExc_ValueError,
3313
0
                        "embedded null character");
3314
0
        return NULL;
3315
0
    }
3316
231k
    return buffer;
3317
231k
}
3318
3319
#endif /* HAVE_WCHAR_H */
3320
3321
int
3322
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3323
0
{
3324
0
    wchar_t **p = (wchar_t **)ptr;
3325
0
    if (obj == NULL) {
3326
0
        PyMem_Free(*p);
3327
0
        *p = NULL;
3328
0
        return 1;
3329
0
    }
3330
0
    if (PyUnicode_Check(obj)) {
3331
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3332
0
        if (*p == NULL) {
3333
0
            return 0;
3334
0
        }
3335
0
        return Py_CLEANUP_SUPPORTED;
3336
0
    }
3337
0
    PyErr_Format(PyExc_TypeError,
3338
0
                 "argument must be str, not %.50s",
3339
0
                 Py_TYPE(obj)->tp_name);
3340
0
    return 0;
3341
0
}
3342
3343
int
3344
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3345
0
{
3346
0
    wchar_t **p = (wchar_t **)ptr;
3347
0
    if (obj == NULL) {
3348
0
        PyMem_Free(*p);
3349
0
        *p = NULL;
3350
0
        return 1;
3351
0
    }
3352
0
    if (obj == Py_None) {
3353
0
        *p = NULL;
3354
0
        return 1;
3355
0
    }
3356
0
    if (PyUnicode_Check(obj)) {
3357
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3358
0
        if (*p == NULL) {
3359
0
            return 0;
3360
0
        }
3361
0
        return Py_CLEANUP_SUPPORTED;
3362
0
    }
3363
0
    PyErr_Format(PyExc_TypeError,
3364
0
                 "argument must be str or None, not %.50s",
3365
0
                 Py_TYPE(obj)->tp_name);
3366
0
    return 0;
3367
0
}
3368
3369
PyObject *
3370
PyUnicode_FromOrdinal(int ordinal)
3371
3.64M
{
3372
3.64M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3373
33
        PyErr_SetString(PyExc_ValueError,
3374
33
                        "chr() arg not in range(0x110000)");
3375
33
        return NULL;
3376
33
    }
3377
3378
3.64M
    return unicode_char((Py_UCS4)ordinal);
3379
3.64M
}
3380
3381
/* Return a true str object for `obj`: a new reference to `obj` itself if
   its type is exactly str, or a copy of its data if it is an instance of a
   str subtype.  Any other object sets TypeError and returns NULL.

   XXX Perhaps this API should be an alias of PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    /* Instance of a str subtype: return a true str with the same data. */
    return _PyUnicode_Copy(obj);
}
3399
3400
/* Decode the bytes or bytes-like object `obj` to a str, using `encoding`
   and `errors`.

   Return NULL with an exception set if `obj` is NULL, is a str, does not
   provide a simple buffer, or if decoding fails.  An empty input returns
   the empty string once unicode_check_encoding_errors() has accepted the
   encoding and errors arguments. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* bytes objects are by far the most common input: handle them without
       going through the buffer protocol. */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Any other object must export its bytes through the PEP 3118 buffer
       interface. */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3452
3453
/* Normalize an encoding name like encodings.normalize_encoding(), with
   optional conversion to lowercase when *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower*.  Every run of
   other characters becomes a single '_', except at the very start of the
   output (nothing written yet) and at the end of the name, where it is
   dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters; *lower* is not null-terminated in that case). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const limit = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        const char ch = *in;

        if (!(Py_ISALNUM(ch) || ch == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == limit) {
            return 0;
        }
        *out++ = to_lower ? Py_TOLOWER(ch) : ch;
    }

    *out = '\0';
    return 1;
}
3502
3503
PyObject *
3504
PyUnicode_Decode(const char *s,
3505
                 Py_ssize_t size,
3506
                 const char *encoding,
3507
                 const char *errors)
3508
5.52M
{
3509
5.52M
    PyObject *buffer = NULL, *unicode;
3510
5.52M
    Py_buffer info;
3511
5.52M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3512
3513
5.52M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3514
0
        return NULL;
3515
0
    }
3516
3517
5.52M
    if (size == 0) {
3518
0
        _Py_RETURN_UNICODE_EMPTY();
3519
0
    }
3520
3521
5.52M
    if (encoding == NULL) {
3522
30.7k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3523
30.7k
    }
3524
3525
    /* Shortcuts for common default encodings */
3526
5.49M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3527
5.49M
        char *lower = buflower;
3528
3529
        /* Fast paths */
3530
5.49M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3531
885k
            lower += 3;
3532
885k
            if (*lower == '_') {
3533
                /* Match "utf8" and "utf_8" */
3534
884k
                lower++;
3535
884k
            }
3536
3537
885k
            if (lower[0] == '8' && lower[1] == 0) {
3538
884k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3539
884k
            }
3540
482
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3541
87
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3542
87
            }
3543
395
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3544
90
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3545
90
            }
3546
885k
        }
3547
4.60M
        else {
3548
4.60M
            if (strcmp(lower, "ascii") == 0
3549
4.15M
                || strcmp(lower, "us_ascii") == 0) {
3550
674k
                return PyUnicode_DecodeASCII(s, size, errors);
3551
674k
            }
3552
    #ifdef MS_WINDOWS
3553
            else if (strcmp(lower, "mbcs") == 0) {
3554
                return PyUnicode_DecodeMBCS(s, size, errors);
3555
            }
3556
    #endif
3557
3.93M
            else if (strcmp(lower, "latin1") == 0
3558
3.93M
                     || strcmp(lower, "latin_1") == 0
3559
904k
                     || strcmp(lower, "iso_8859_1") == 0
3560
3.05M
                     || strcmp(lower, "iso8859_1") == 0) {
3561
3.05M
                return PyUnicode_DecodeLatin1(s, size, errors);
3562
3.05M
            }
3563
4.60M
        }
3564
5.49M
    }
3565
3566
    /* Decode via the codec registry */
3567
883k
    buffer = NULL;
3568
883k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3569
0
        goto onError;
3570
883k
    buffer = PyMemoryView_FromBuffer(&info);
3571
883k
    if (buffer == NULL)
3572
0
        goto onError;
3573
883k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3574
883k
    if (unicode == NULL)
3575
79.3k
        goto onError;
3576
804k
    if (!PyUnicode_Check(unicode)) {
3577
0
        PyErr_Format(PyExc_TypeError,
3578
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3579
0
                     "use codecs.decode() to decode to arbitrary types",
3580
0
                     encoding,
3581
0
                     Py_TYPE(unicode)->tp_name);
3582
0
        Py_DECREF(unicode);
3583
0
        goto onError;
3584
0
    }
3585
804k
    Py_DECREF(buffer);
3586
804k
    return unicode_result(unicode);
3587
3588
79.3k
  onError:
3589
79.3k
    Py_XDECREF(buffer);
3590
79.3k
    return NULL;
3591
804k
}
3592
3593
/* Pass the str object `unicode` to the decoder of `encoding` (the default
   encoding when NULL) via the codec registry and return whatever object the
   codec produces.  A non-str argument is reported with PyErr_BadArgument()
   and NULL is returned. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec, errors);
}
3609
3610
/* Pass the str object `unicode` to the decoder of `encoding` (the default
   encoding when NULL) via the codec registry and require the result to be a
   str as well.

   Returns the decoded str, or NULL with an exception set when `unicode` is
   not a str, when the codec fails, or when the codec returns an object of
   another type (TypeError). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Name the type the decoder actually returned (v); the input
           object is always a str here, so naming it would be misleading. */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3643
3644
/* Pass the str object `unicode` to the encoder of `encoding` (the default
   encoding when NULL) via the codec registry and return whatever object the
   codec produces.  A non-str argument is reported with PyErr_BadArgument()
   and NULL is returned. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed through. */
    return PyCodec_Encode(unicode, codec, errors);
}
3668
3669
3670
static PyObject *
3671
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3672
                      int current_locale)
3673
816
{
3674
816
    Py_ssize_t wlen;
3675
816
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3676
816
    if (wstr == NULL) {
3677
0
        return NULL;
3678
0
    }
3679
3680
816
    if ((size_t)wlen != wcslen(wstr)) {
3681
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3682
0
        PyMem_Free(wstr);
3683
0
        return NULL;
3684
0
    }
3685
3686
816
    char *str;
3687
816
    size_t error_pos;
3688
816
    const char *reason;
3689
816
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3690
816
                                 current_locale, error_handler);
3691
816
    PyMem_Free(wstr);
3692
3693
816
    if (res != 0) {
3694
0
        if (res == -2) {
3695
0
            PyObject *exc;
3696
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3697
0
                    "locale", unicode,
3698
0
                    (Py_ssize_t)error_pos,
3699
0
                    (Py_ssize_t)(error_pos+1),
3700
0
                    reason);
3701
0
            if (exc != NULL) {
3702
0
                PyCodec_StrictErrors(exc);
3703
0
                Py_DECREF(exc);
3704
0
            }
3705
0
        }
3706
0
        else if (res == -3) {
3707
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3708
0
        }
3709
0
        else {
3710
0
            PyErr_NoMemory();
3711
0
        }
3712
0
        return NULL;
3713
0
    }
3714
3715
816
    PyObject *bytes = PyBytes_FromString(str);
3716
816
    PyMem_RawFree(str);
3717
816
    return bytes;
3718
816
}
3719
3720
/* Encode `unicode` with unicode_encode_locale(), using the error handler
   named by `errors` and current_locale=1. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3726
3727
/* Encode `unicode` with the filesystem codec of the current interpreter.

   The UTF-8 encoder is used when the interpreter's fs_codec is flagged as
   UTF-8; otherwise the configured codec name is used when it is set.  When
   neither is available the error handler is taken from the interpreter
   configuration and the locale encoder (or UTF-8 when
   _Py_FORCE_UTF8_FS_ENCODING is defined) is used. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *errors_name = config->filesystem_errors;
    assert(errors_name != NULL);
    _Py_error_handler handler = get_error_handler_wide(errors_name);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3760
3761
/* Encode the str object `unicode` to bytes with the codec named `encoding`
   and the error handling scheme `errors`.

   A NULL encoding means UTF-8.  A handful of well-known encoding names are
   dispatched directly to the built-in encoders; every other name goes
   through the codec registry.  A codec that returns a bytearray triggers a
   RuntimeWarning and its result is copied into a bytes object; any other
   non-bytes result raises TypeError.

   Returns NULL with an exception set on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* sizeof("iso_8859_1") == 11: the longest shortcut name plus its NUL. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common encodings; names that do not fit in the buffer
       cannot be one of them and fall through to the registry. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        const char *name = normalized;

        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            /* Step over "utf" and an optional '_' so that "utf8" and
               "utf_8" are matched alike. */
            name += 3;
            if (*name == '_') {
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0)
            {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0
                || strcmp(name, "latin_1") == 0
                || strcmp(name, "iso_8859_1") == 0
                || strcmp(name, "iso8859_1") == 0)
            {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* The codec returned a bytearray: warn, then convert to bytes. */
    int warn_failed = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
        "encoder %s returned bytearray instead of bytes; "
        "use codecs.encode() to encode to arbitrary types",
        encoding);
    if (warn_failed) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3860
3861
/* Pass the str object `unicode` to the encoder of `encoding` (the default
   encoding when NULL) via the codec registry and require the result to be a
   str.  Returns NULL with an exception set when `unicode` is not a str,
   when the codec fails, or when the codec returns another type
   (TypeError). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = encoding;
    if (codec == NULL) {
        codec = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, codec, errors);
    if (result == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3894
3895
static PyObject*
3896
unicode_decode_locale(const char *str, Py_ssize_t len,
3897
                      _Py_error_handler errors, int current_locale)
3898
359k
{
3899
359k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3900
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3901
0
        return NULL;
3902
0
    }
3903
3904
359k
    wchar_t *wstr;
3905
359k
    size_t wlen;
3906
359k
    const char *reason;
3907
359k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3908
359k
                                 current_locale, errors);
3909
359k
    if (res != 0) {
3910
0
        if (res == -2) {
3911
0
            PyObject *exc;
3912
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3913
0
                                        "locale", str, len,
3914
0
                                        (Py_ssize_t)wlen,
3915
0
                                        (Py_ssize_t)(wlen + 1),
3916
0
                                        reason);
3917
0
            if (exc != NULL) {
3918
0
                PyCodec_StrictErrors(exc);
3919
0
                Py_DECREF(exc);
3920
0
            }
3921
0
        }
3922
0
        else if (res == -3) {
3923
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3924
0
        }
3925
0
        else {
3926
0
            PyErr_NoMemory();
3927
0
        }
3928
0
        return NULL;
3929
0
    }
3930
3931
359k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3932
359k
    PyMem_RawFree(wstr);
3933
359k
    return unicode;
3934
359k
}
3935
3936
PyObject*
3937
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3938
                              const char *errors)
3939
0
{
3940
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3941
0
    return unicode_decode_locale(str, len, error_handler, 1);
3942
0
}
3943
3944
PyObject*
3945
PyUnicode_DecodeLocale(const char *str, const char *errors)
3946
348k
{
3947
348k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3948
348k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3949
348k
    return unicode_decode_locale(str, size, error_handler, 1);
3950
348k
}
3951
3952
3953
PyObject*
3954
203
PyUnicode_DecodeFSDefault(const char *s) {
3955
203
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3956
203
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3957
203
}
3958
3959
PyObject*
3960
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3961
207k
{
3962
207k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3963
207k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3964
207k
    if (fs_codec->utf8) {
3965
196k
        return unicode_decode_utf8(s, size,
3966
196k
                                   fs_codec->error_handler,
3967
196k
                                   fs_codec->errors,
3968
196k
                                   NULL);
3969
196k
    }
3970
10.9k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3971
10.9k
    else if (fs_codec->encoding) {
3972
0
        return PyUnicode_Decode(s, size,
3973
0
                                fs_codec->encoding,
3974
0
                                fs_codec->errors);
3975
0
    }
3976
10.9k
#endif
3977
10.9k
    else {
3978
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3979
           machinery is not ready and so cannot be used:
3980
           use mbstowcs() in this case. */
3981
10.9k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3982
10.9k
        const wchar_t *filesystem_errors = config->filesystem_errors;
3983
10.9k
        assert(filesystem_errors != NULL);
3984
10.9k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3985
10.9k
        assert(errors != _Py_ERROR_UNKNOWN);
3986
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3987
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3988
#else
3989
10.9k
        return unicode_decode_locale(s, size, errors, 0);
3990
10.9k
#endif
3991
10.9k
    }
3992
207k
}
3993
3994
3995
int
3996
PyUnicode_FSConverter(PyObject* arg, void* addr)
3997
298k
{
3998
298k
    PyObject *path = NULL;
3999
298k
    PyObject *output = NULL;
4000
298k
    Py_ssize_t size;
4001
298k
    const char *data;
4002
298k
    if (arg == NULL) {
4003
0
        Py_DECREF(*(PyObject**)addr);
4004
0
        *(PyObject**)addr = NULL;
4005
0
        return 1;
4006
0
    }
4007
298k
    path = PyOS_FSPath(arg);
4008
298k
    if (path == NULL) {
4009
0
        return 0;
4010
0
    }
4011
298k
    if (PyBytes_Check(path)) {
4012
0
        output = path;
4013
0
    }
4014
298k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4015
298k
        output = PyUnicode_EncodeFSDefault(path);
4016
298k
        Py_DECREF(path);
4017
298k
        if (!output) {
4018
0
            return 0;
4019
0
        }
4020
298k
        assert(PyBytes_Check(output));
4021
298k
    }
4022
4023
298k
    size = PyBytes_GET_SIZE(output);
4024
298k
    data = PyBytes_AS_STRING(output);
4025
298k
    if ((size_t)size != strlen(data)) {
4026
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4027
0
        Py_DECREF(output);
4028
0
        return 0;
4029
0
    }
4030
298k
    *(PyObject**)addr = output;
4031
298k
    return Py_CLEANUP_SUPPORTED;
4032
298k
}
4033
4034
4035
int
4036
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4037
110k
{
4038
110k
    if (arg == NULL) {
4039
0
        Py_DECREF(*(PyObject**)addr);
4040
0
        *(PyObject**)addr = NULL;
4041
0
        return 1;
4042
0
    }
4043
4044
110k
    PyObject *path = PyOS_FSPath(arg);
4045
110k
    if (path == NULL) {
4046
0
        return 0;
4047
0
    }
4048
4049
110k
    PyObject *output = NULL;
4050
110k
    if (PyUnicode_Check(path)) {
4051
110k
        output = path;
4052
110k
    }
4053
0
    else if (PyBytes_Check(path)) {
4054
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4055
0
                                                  PyBytes_GET_SIZE(path));
4056
0
        Py_DECREF(path);
4057
0
        if (!output) {
4058
0
            return 0;
4059
0
        }
4060
0
    }
4061
0
    else {
4062
0
        PyErr_Format(PyExc_TypeError,
4063
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4064
0
                     Py_TYPE(arg)->tp_name);
4065
0
        Py_DECREF(path);
4066
0
        return 0;
4067
0
    }
4068
4069
110k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4070
110k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4071
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4072
0
        Py_DECREF(output);
4073
0
        return 0;
4074
0
    }
4075
110k
    *(PyObject**)addr = output;
4076
110k
    return Py_CLEANUP_SUPPORTED;
4077
110k
}
4078
4079
4080
static int unicode_fill_utf8(PyObject *unicode);
4081
4082
4083
static int
4084
unicode_ensure_utf8(PyObject *unicode)
4085
24.5M
{
4086
24.5M
    int err = 0;
4087
24.5M
    if (PyUnicode_UTF8(unicode) == NULL) {
4088
170k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4089
170k
        if (PyUnicode_UTF8(unicode) == NULL) {
4090
170k
            err = unicode_fill_utf8(unicode);
4091
170k
        }
4092
170k
        Py_END_CRITICAL_SECTION();
4093
170k
    }
4094
24.5M
    return err;
4095
24.5M
}
4096
4097
const char *
4098
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4099
24.5M
{
4100
24.5M
    if (!PyUnicode_Check(unicode)) {
4101
0
        PyErr_BadArgument();
4102
0
        if (psize) {
4103
0
            *psize = -1;
4104
0
        }
4105
0
        return NULL;
4106
0
    }
4107
4108
24.5M
    if (unicode_ensure_utf8(unicode) == -1) {
4109
207
        if (psize) {
4110
207
            *psize = -1;
4111
207
        }
4112
207
        return NULL;
4113
207
    }
4114
4115
24.5M
    if (psize) {
4116
24.3M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4117
24.3M
    }
4118
24.5M
    return PyUnicode_UTF8(unicode);
4119
24.5M
}
4120
4121
const char *
4122
PyUnicode_AsUTF8(PyObject *unicode)
4123
275k
{
4124
275k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4125
275k
}
4126
4127
const char *
4128
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4129
700k
{
4130
700k
    Py_ssize_t size;
4131
700k
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4132
700k
    if (s && strlen(s) != (size_t)size) {
4133
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4134
0
        return NULL;
4135
0
    }
4136
700k
    return s;
4137
700k
}
4138
4139
/*
4140
PyUnicode_GetSize() has been deprecated since Python 3.3
4141
because it returned length of Py_UNICODE.
4142
4143
But this function is part of stable abi, because it doesn't
4144
include Py_UNICODE in signature and it was not excluded from
4145
stable ABI in PEP 384.
4146
*/
4147
PyAPI_FUNC(Py_ssize_t)
4148
PyUnicode_GetSize(PyObject *unicode)
4149
0
{
4150
0
    PyErr_SetString(PyExc_RuntimeError,
4151
0
                    "PyUnicode_GetSize has been removed.");
4152
0
    return -1;
4153
0
}
4154
4155
Py_ssize_t
4156
PyUnicode_GetLength(PyObject *unicode)
4157
25.6k
{
4158
25.6k
    if (!PyUnicode_Check(unicode)) {
4159
0
        PyErr_BadArgument();
4160
0
        return -1;
4161
0
    }
4162
25.6k
    return PyUnicode_GET_LENGTH(unicode);
4163
25.6k
}
4164
4165
Py_UCS4
4166
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4167
18
{
4168
18
    const void *data;
4169
18
    int kind;
4170
4171
18
    if (!PyUnicode_Check(unicode)) {
4172
0
        PyErr_BadArgument();
4173
0
        return (Py_UCS4)-1;
4174
0
    }
4175
18
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4176
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4177
0
        return (Py_UCS4)-1;
4178
0
    }
4179
18
    data = PyUnicode_DATA(unicode);
4180
18
    kind = PyUnicode_KIND(unicode);
4181
18
    return PyUnicode_READ(kind, data, index);
4182
18
}
4183
4184
int
4185
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4186
0
{
4187
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4188
0
        PyErr_BadArgument();
4189
0
        return -1;
4190
0
    }
4191
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4192
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4193
0
        return -1;
4194
0
    }
4195
0
    if (unicode_check_modifiable(unicode))
4196
0
        return -1;
4197
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4198
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4199
0
        return -1;
4200
0
    }
4201
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4202
0
                    index, ch);
4203
0
    return 0;
4204
0
}
4205
4206
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";
    return default_encoding;
}
4211
4212
/* create or adjust a UnicodeDecodeError */
4213
static void
4214
make_decode_exception(PyObject **exceptionObject,
4215
                      const char *encoding,
4216
                      const char *input, Py_ssize_t length,
4217
                      Py_ssize_t startpos, Py_ssize_t endpos,
4218
                      const char *reason)
4219
338k
{
4220
338k
    if (*exceptionObject == NULL) {
4221
100k
        *exceptionObject = PyUnicodeDecodeError_Create(
4222
100k
            encoding, input, length, startpos, endpos, reason);
4223
100k
    }
4224
238k
    else {
4225
238k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4226
0
            goto onError;
4227
238k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4228
0
            goto onError;
4229
238k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4230
0
            goto onError;
4231
238k
    }
4232
338k
    return;
4233
4234
338k
onError:
4235
0
    Py_CLEAR(*exceptionObject);
4236
0
}
4237
4238
#ifdef MS_WINDOWS
4239
static int
4240
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4241
{
4242
    if (newsize > *size) {
4243
        wchar_t *newbuf = *buf;
4244
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4245
            PyErr_NoMemory();
4246
            return -1;
4247
        }
4248
        *buf = newbuf;
4249
    }
4250
    *size = newsize;
4251
    return 0;
4252
}
4253
4254
/* error handling callback helper:
4255
   build arguments, call the callback and check the arguments,
4256
   if no exception occurred, copy the replacement to the output
4257
   and adjust various state variables.
4258
   return 0 on success, -1 on error
4259
*/
4260
4261
static int
4262
unicode_decode_call_errorhandler_wchar(
4263
    const char *errors, PyObject **errorHandler,
4264
    const char *encoding, const char *reason,
4265
    const char **input, const char **inend, Py_ssize_t *startinpos,
4266
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4267
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4268
{
4269
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4270
4271
    PyObject *restuple = NULL;
4272
    PyObject *repunicode = NULL;
4273
    Py_ssize_t outsize;
4274
    Py_ssize_t insize;
4275
    Py_ssize_t requiredsize;
4276
    Py_ssize_t newpos;
4277
    PyObject *inputobj = NULL;
4278
    Py_ssize_t repwlen;
4279
4280
    if (*errorHandler == NULL) {
4281
        *errorHandler = PyCodec_LookupError(errors);
4282
        if (*errorHandler == NULL)
4283
            goto onError;
4284
    }
4285
4286
    make_decode_exception(exceptionObject,
4287
        encoding,
4288
        *input, *inend - *input,
4289
        *startinpos, *endinpos,
4290
        reason);
4291
    if (*exceptionObject == NULL)
4292
        goto onError;
4293
4294
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4295
    if (restuple == NULL)
4296
        goto onError;
4297
    if (!PyTuple_Check(restuple)) {
4298
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4299
        goto onError;
4300
    }
4301
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4302
        goto onError;
4303
4304
    /* Copy back the bytes variables, which might have been modified by the
4305
       callback */
4306
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4307
    if (!inputobj)
4308
        goto onError;
4309
    *input = PyBytes_AS_STRING(inputobj);
4310
    insize = PyBytes_GET_SIZE(inputobj);
4311
    *inend = *input + insize;
4312
    /* we can DECREF safely, as the exception has another reference,
4313
       so the object won't go away. */
4314
    Py_DECREF(inputobj);
4315
4316
    if (newpos<0)
4317
        newpos = insize+newpos;
4318
    if (newpos<0 || newpos>insize) {
4319
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4320
        goto onError;
4321
    }
4322
4323
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4324
    if (repwlen < 0)
4325
        goto onError;
4326
    repwlen--;
4327
    /* need more space? (at least enough for what we
4328
       have+the replacement+the rest of the string (starting
4329
       at the new input position), so we won't have to check space
4330
       when there are no errors in the rest of the string) */
4331
    requiredsize = *outpos;
4332
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4333
        goto overflow;
4334
    requiredsize += repwlen;
4335
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4336
        goto overflow;
4337
    requiredsize += insize - newpos;
4338
    outsize = *bufsize;
4339
    if (requiredsize > outsize) {
4340
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4341
            requiredsize = 2*outsize;
4342
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4343
            goto onError;
4344
        }
4345
    }
4346
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4347
    *outpos += repwlen;
4348
    *endinpos = newpos;
4349
    *inptr = *input + newpos;
4350
4351
    /* we made it! */
4352
    Py_DECREF(restuple);
4353
    return 0;
4354
4355
  overflow:
4356
    PyErr_SetString(PyExc_OverflowError,
4357
                    "decoded result is too long for a Python string");
4358
4359
  onError:
4360
    Py_XDECREF(restuple);
4361
    return -1;
4362
}
4363
#endif   /* MS_WINDOWS */
4364
4365
static int
4366
unicode_decode_call_errorhandler_writer(
4367
    const char *errors, PyObject **errorHandler,
4368
    const char *encoding, const char *reason,
4369
    const char **input, const char **inend, Py_ssize_t *startinpos,
4370
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4371
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4372
338k
{
4373
338k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4374
4375
338k
    PyObject *restuple = NULL;
4376
338k
    PyObject *repunicode = NULL;
4377
338k
    Py_ssize_t insize;
4378
338k
    Py_ssize_t newpos;
4379
338k
    Py_ssize_t replen;
4380
338k
    Py_ssize_t remain;
4381
338k
    PyObject *inputobj = NULL;
4382
338k
    int need_to_grow = 0;
4383
338k
    const char *new_inptr;
4384
4385
338k
    if (*errorHandler == NULL) {
4386
100k
        *errorHandler = PyCodec_LookupError(errors);
4387
100k
        if (*errorHandler == NULL)
4388
0
            goto onError;
4389
100k
    }
4390
4391
338k
    make_decode_exception(exceptionObject,
4392
338k
        encoding,
4393
338k
        *input, *inend - *input,
4394
338k
        *startinpos, *endinpos,
4395
338k
        reason);
4396
338k
    if (*exceptionObject == NULL)
4397
0
        goto onError;
4398
4399
338k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4400
338k
    if (restuple == NULL)
4401
59.7k
        goto onError;
4402
278k
    if (!PyTuple_Check(restuple)) {
4403
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4404
0
        goto onError;
4405
0
    }
4406
278k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4407
0
        goto onError;
4408
4409
    /* Copy back the bytes variables, which might have been modified by the
4410
       callback */
4411
278k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4412
278k
    if (!inputobj)
4413
0
        goto onError;
4414
278k
    remain = *inend - *input - *endinpos;
4415
278k
    *input = PyBytes_AS_STRING(inputobj);
4416
278k
    insize = PyBytes_GET_SIZE(inputobj);
4417
278k
    *inend = *input + insize;
4418
    /* we can DECREF safely, as the exception has another reference,
4419
       so the object won't go away. */
4420
278k
    Py_DECREF(inputobj);
4421
4422
278k
    if (newpos<0)
4423
0
        newpos = insize+newpos;
4424
278k
    if (newpos<0 || newpos>insize) {
4425
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4426
0
        goto onError;
4427
0
    }
4428
4429
278k
    replen = PyUnicode_GET_LENGTH(repunicode);
4430
278k
    if (replen > 1) {
4431
30.6k
        writer->min_length += replen - 1;
4432
30.6k
        need_to_grow = 1;
4433
30.6k
    }
4434
278k
    new_inptr = *input + newpos;
4435
278k
    if (*inend - new_inptr > remain) {
4436
        /* We don't know the decoding algorithm here so we make the worst
4437
           assumption that one byte decodes to one unicode character.
4438
           If unfortunately one byte could decode to more unicode characters,
4439
           the decoder may write out-of-bound then.  Is it possible for the
4440
           algorithms using this function? */
4441
15.9k
        writer->min_length += *inend - new_inptr - remain;
4442
15.9k
        need_to_grow = 1;
4443
15.9k
    }
4444
278k
    if (need_to_grow) {
4445
30.8k
        writer->overallocate = 1;
4446
30.8k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4447
30.8k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4448
0
            goto onError;
4449
30.8k
    }
4450
278k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4451
0
        goto onError;
4452
4453
278k
    *endinpos = newpos;
4454
278k
    *inptr = new_inptr;
4455
4456
    /* we made it! */
4457
278k
    Py_DECREF(restuple);
4458
278k
    return 0;
4459
4460
59.7k
  onError:
4461
59.7k
    Py_XDECREF(restuple);
4462
59.7k
    return -1;
4463
278k
}
4464
4465
/* --- UTF-7 Codec -------------------------------------------------------- */
4466
4467
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4468
4469
/* Three simple macros defining base-64. */
4470
4471
/* True when c is one of the 64 base-64 alphabet characters:
   A-Z, a-z, 0-9, '+' or '/'.  The argument is evaluated more than once. */

#define IS_BASE64(c)                    \
    (('A' <= (c) && (c) <= 'Z')         \
     || ('a' <= (c) && (c) <= 'z')      \
     || ('0' <= (c) && (c) <= '9')      \
     || (c) == '+'                      \
     || (c) == '/')
4478
4479
/* Map a base-64 character to its 6-bit value:
 *   'A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
 *   '+' -> 62, and anything else is treated as '/' -> 63.
 * Only meaningful when IS_BASE64(c) holds.  The argument is expanded
 * several times, so it must be free of side effects. */

#define FROM_BASE64(c)                                      \
    (                                                       \
        ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :            \
        ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :       \
        ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :       \
        (c) == '+' ? 62 :                                   \
        63                                                  \
    )
4486
4487
/* Base-64 character for the low six bits of n; any higher bits of n
   are masked off before the alphabet lookup. */

#define TO_BASE64(n)                                                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"     \
        [(n) & 0x3f])
4491
4492
/* DECODE_DIRECT: true when a byte met outside a shift sequence stands
 * for itself.  Decoding is permissive: every ASCII byte qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
4499
4500
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is only ever read,
 * so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4533
4534
/* ENCODE_DIRECT: true when the encoder may emit c as itself.  That is
 * the case for a non-NUL ASCII code point (0 < c < 128) whose
 * utf7_category entry is anything but 3 ("special"); every other code
 * point has to go through base-64.  The argument is expanded several
 * times, so it must be free of side effects. */

#define ENCODE_DIRECT(c)                    \
    (                                       \
        (c) < 128                           \
        && (c) > 0                          \
        && (utf7_category[(c)] != 3)        \
    )
4542
4543
PyObject *
4544
PyUnicode_DecodeUTF7(const char *s,
4545
                     Py_ssize_t size,
4546
                     const char *errors)
4547
0
{
4548
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4549
0
}
4550
4551
/* The decoder.  The only state we preserve is our read position,
4552
 * i.e. how many characters we have consumed.  So if we end in the
4553
 * middle of a shift sequence we have to back off the read position
4554
 * and the output to the beginning of the sequence, otherwise we lose
4555
 * all the shift state (seen bits, number of bits seen, high
4556
 * surrogate). */
4557
4558
PyObject *
4559
PyUnicode_DecodeUTF7Stateful(const char *s,
4560
                             Py_ssize_t size,
4561
                             const char *errors,
4562
                             Py_ssize_t *consumed)
4563
20.7k
{
4564
20.7k
    const char *starts = s;
4565
20.7k
    Py_ssize_t startinpos;
4566
20.7k
    Py_ssize_t endinpos;
4567
20.7k
    const char *e;
4568
20.7k
    _PyUnicodeWriter writer;
4569
20.7k
    const char *errmsg = "";
4570
20.7k
    int inShift = 0;
4571
20.7k
    Py_ssize_t shiftOutStart;
4572
20.7k
    unsigned int base64bits = 0;
4573
20.7k
    unsigned long base64buffer = 0;
4574
20.7k
    Py_UCS4 surrogate = 0;
4575
20.7k
    PyObject *errorHandler = NULL;
4576
20.7k
    PyObject *exc = NULL;
4577
4578
20.7k
    if (size == 0) {
4579
0
        if (consumed)
4580
0
            *consumed = 0;
4581
0
        _Py_RETURN_UNICODE_EMPTY();
4582
0
    }
4583
4584
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4585
20.7k
    _PyUnicodeWriter_Init(&writer);
4586
20.7k
    writer.min_length = size;
4587
4588
20.7k
    shiftOutStart = 0;
4589
20.7k
    e = s + size;
4590
4591
4.41M
    while (s < e) {
4592
4.40M
        Py_UCS4 ch;
4593
4.40M
      restart:
4594
4.40M
        ch = (unsigned char) *s;
4595
4596
4.40M
        if (inShift) { /* in a base-64 section */
4597
238k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4598
224k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4599
224k
                base64bits += 6;
4600
224k
                s++;
4601
224k
                if (base64bits >= 16) {
4602
                    /* we have enough bits for a UTF-16 value */
4603
78.2k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4604
78.2k
                    base64bits -= 16;
4605
78.2k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4606
78.2k
                    assert(outCh <= 0xffff);
4607
78.2k
                    if (surrogate) {
4608
                        /* expecting a second surrogate */
4609
7.51k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4610
2.71k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4611
2.71k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4612
0
                                goto onError;
4613
2.71k
                            surrogate = 0;
4614
2.71k
                            continue;
4615
2.71k
                        }
4616
4.79k
                        else {
4617
4.79k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4618
0
                                goto onError;
4619
4.79k
                            surrogate = 0;
4620
4.79k
                        }
4621
7.51k
                    }
4622
75.5k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4623
                        /* first surrogate */
4624
11.4k
                        surrogate = outCh;
4625
11.4k
                    }
4626
64.0k
                    else {
4627
64.0k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4628
0
                            goto onError;
4629
64.0k
                    }
4630
75.5k
                }
4631
224k
            }
4632
14.0k
            else { /* now leaving a base-64 section */
4633
14.0k
                inShift = 0;
4634
14.0k
                if (base64bits > 0) { /* left-over bits */
4635
11.7k
                    if (base64bits >= 6) {
4636
                        /* We've seen at least one base-64 character */
4637
6.31k
                        s++;
4638
6.31k
                        errmsg = "partial character in shift sequence";
4639
6.31k
                        goto utf7Error;
4640
6.31k
                    }
4641
5.47k
                    else {
4642
                        /* Some bits remain; they should be zero */
4643
5.47k
                        if (base64buffer != 0) {
4644
1.47k
                            s++;
4645
1.47k
                            errmsg = "non-zero padding bits in shift sequence";
4646
1.47k
                            goto utf7Error;
4647
1.47k
                        }
4648
5.47k
                    }
4649
11.7k
                }
4650
6.25k
                if (surrogate && DECODE_DIRECT(ch)) {
4651
2.95k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4652
0
                        goto onError;
4653
2.95k
                }
4654
6.25k
                surrogate = 0;
4655
6.25k
                if (ch == '-') {
4656
                    /* '-' is absorbed; other terminating
4657
                       characters are preserved */
4658
2.04k
                    s++;
4659
2.04k
                }
4660
6.25k
            }
4661
238k
        }
4662
4.16M
        else if ( ch == '+' ) {
4663
22.2k
            startinpos = s-starts;
4664
22.2k
            s++; /* consume '+' */
4665
22.2k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4666
1.76k
                s++;
4667
1.76k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4668
0
                    goto onError;
4669
1.76k
            }
4670
20.4k
            else if (s < e && !IS_BASE64(*s)) {
4671
2.81k
                s++;
4672
2.81k
                errmsg = "ill-formed sequence";
4673
2.81k
                goto utf7Error;
4674
2.81k
            }
4675
17.6k
            else { /* begin base64-encoded section */
4676
17.6k
                inShift = 1;
4677
17.6k
                surrogate = 0;
4678
17.6k
                shiftOutStart = writer.pos;
4679
17.6k
                base64bits = 0;
4680
17.6k
                base64buffer = 0;
4681
17.6k
            }
4682
22.2k
        }
4683
4.14M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4684
4.03M
            s++;
4685
4.03M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4686
0
                goto onError;
4687
4.03M
        }
4688
111k
        else {
4689
111k
            startinpos = s-starts;
4690
111k
            s++;
4691
111k
            errmsg = "unexpected special character";
4692
111k
            goto utf7Error;
4693
111k
        }
4694
4.27M
        continue;
4695
4.27M
utf7Error:
4696
122k
        endinpos = s-starts;
4697
122k
        if (unicode_decode_call_errorhandler_writer(
4698
122k
                errors, &errorHandler,
4699
122k
                "utf7", errmsg,
4700
122k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4701
122k
                &writer))
4702
9.16k
            goto onError;
4703
122k
    }
4704
4705
    /* end of string */
4706
4707
11.5k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4708
        /* if we're in an inconsistent state, that's an error */
4709
3.63k
        inShift = 0;
4710
3.63k
        if (surrogate ||
4711
3.08k
                (base64bits >= 6) ||
4712
2.33k
                (base64bits > 0 && base64buffer != 0)) {
4713
2.33k
            endinpos = size;
4714
2.33k
            if (unicode_decode_call_errorhandler_writer(
4715
2.33k
                    errors, &errorHandler,
4716
2.33k
                    "utf7", "unterminated shift sequence",
4717
2.33k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4718
2.33k
                    &writer))
4719
1.93k
                goto onError;
4720
393
            if (s < e)
4721
0
                goto restart;
4722
393
        }
4723
3.63k
    }
4724
4725
    /* return state */
4726
9.64k
    if (consumed) {
4727
0
        if (inShift) {
4728
0
            *consumed = startinpos;
4729
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4730
0
                PyObject *result = PyUnicode_FromKindAndData(
4731
0
                        writer.kind, writer.data, shiftOutStart);
4732
0
                Py_XDECREF(errorHandler);
4733
0
                Py_XDECREF(exc);
4734
0
                _PyUnicodeWriter_Dealloc(&writer);
4735
0
                return result;
4736
0
            }
4737
0
            writer.pos = shiftOutStart; /* back off output */
4738
0
        }
4739
0
        else {
4740
0
            *consumed = s-starts;
4741
0
        }
4742
0
    }
4743
4744
9.64k
    Py_XDECREF(errorHandler);
4745
9.64k
    Py_XDECREF(exc);
4746
9.64k
    return _PyUnicodeWriter_Finish(&writer);
4747
4748
11.1k
  onError:
4749
11.1k
    Py_XDECREF(errorHandler);
4750
11.1k
    Py_XDECREF(exc);
4751
11.1k
    _PyUnicodeWriter_Dealloc(&writer);
4752
11.1k
    return NULL;
4753
9.64k
}
4754
4755
4756
/* Encode the string `str` as UTF-7 and return the result of
 * PyBytesWriter_FinishWithPointer(); an empty string yields the empty
 * bytes constant.  `errors` is accepted but not consulted by this
 * function.
 *
 * Characters for which ENCODE_DIRECT() holds are copied through, '+'
 * becomes "+-", and everything else is written as base-64 over 16-bit
 * units inside a '+' ... shift sequence (code points >= 0x10000 as a
 * surrogate pair).
 */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    if (nchars == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* It might be possible to tighten this worst case */
    if (nchars > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(nchars * 8);
    if (writer == NULL) {
        return NULL;
    }

    int shifted = 0;            /* inside a '+' ... base-64 section? */
    unsigned int nbits = 0;     /* pending bits not yet written out */
    unsigned long bitbuf = 0;   /* bit accumulator; low nbits are pending */
    char *out = PyBytesWriter_GetData(writer);
    for (Py_ssize_t i = 0; i < nchars; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (!shifted) {
            if (ch == '+') {
                /* a literal plus sign is spelled "+-" */
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* open a shift sequence, then base-64 encode ch below */
            *out++ = '+';
            shifted = 1;
        }
        else if (ENCODE_DIRECT(ch)) {
            /* shifting out */
            if (nbits) { /* output remaining bits */
                *out++ = TO_BASE64(bitbuf << (6-nbits));
                bitbuf = 0;
                nbits = 0;
            }
            shifted = 0;
            /* Characters not in the BASE64 set implicitly unshift the sequence
               so no '-' is required, except if the character is itself a '-' */
            if (IS_BASE64(ch) || ch == '-') {
                *out++ = '-';
            }
            *out++ = (char) ch;
            continue;
        }

        /* ch has to be base-64 encoded */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
                nbits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
            nbits -= 6;
        }
    }
    if (nbits) {
        /* flush the left-over bits, zero-padded to six */
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    }
    if (shifted) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4845
4846
#undef IS_BASE64
4847
#undef FROM_BASE64
4848
#undef TO_BASE64
4849
#undef DECODE_DIRECT
4850
#undef ENCODE_DIRECT
4851
4852
/* --- UTF-8 Codec -------------------------------------------------------- */
4853
4854
PyObject *
4855
PyUnicode_DecodeUTF8(const char *s,
4856
                     Py_ssize_t size,
4857
                     const char *errors)
4858
54.2M
{
4859
54.2M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4860
54.2M
}
4861
4862
#include "stringlib/asciilib.h"
4863
#include "stringlib/codecs.h"
4864
#include "stringlib/undef.h"
4865
4866
#include "stringlib/ucs1lib.h"
4867
#include "stringlib/codecs.h"
4868
#include "stringlib/undef.h"
4869
4870
#include "stringlib/ucs2lib.h"
4871
#include "stringlib/codecs.h"
4872
#include "stringlib/undef.h"
4873
4874
#include "stringlib/ucs4lib.h"
4875
#include "stringlib/codecs.h"
4876
#include "stringlib/undef.h"
4877
4878
/* Word-wide bit patterns, sized to match 'size_t':
 *   ASCII_CHAR_MASK  top bit of every byte; a non-zero AND with a word
 *                    means the word holds a non-ASCII (UTF-8 encoded)
 *                    byte.
 *   VECTOR_0101      low bit of every byte   (UTF-8 code point counting)
 *   VECTOR_00FF      low byte of every 16-bit lane (code point counting)
 */
#if SIZEOF_SIZE_T == 8
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#elif SIZEOF_SIZE_T == 4
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4892
4893
/* ctz(v): number of trailing zero bits in v, via a compiler intrinsic.
   HAVE_CTZ is 1 when an implementation is provided (clang, GCC, MSVC)
   and 0 otherwise, in which case ctz() is not defined at all. */
#if defined(__clang__) || defined(__GNUC__)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return __builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long bit_index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&bit_index, v);
#else
    _BitScanForward64(&bit_index, v);
#endif /* SIZEOF_SIZE_T */
    return bit_index;
}
#else
#define HAVE_CTZ 0
#endif
4916
4917
#if HAVE_CTZ && PY_LITTLE_ENDIAN
/* Gather p[0]..p[size-1] into a size_t whose remaining bytes are zero,
   reading one byte at a time so that nothing past p[size-1] is touched
   and no unaligned word access is made.  A size above SIZEOF_SIZE_T is
   handled like SIZEOF_SIZE_T. */
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t whole;
        unsigned char bytes[SIZEOF_SIZE_T];
    } word;
    word.whole = 0;
    /* Byte i of the input lands in byte i of the word, which is only the
       right layout on little endian.  The union was chosen over shifts
       and ORs for speed; big endian machines are rare and hard to
       maintain.  Each case copies one byte and falls through to the
       next lower index. */
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        word.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        word.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        word.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        word.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        word.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        word.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        word.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        word.bytes[0] = p[0];
        break;
    case 0:
        break;
    }
    return word.whole;
}
#endif
4964
4965
/*
4966
 * Find the first non-ASCII character in a byte sequence.
4967
 *
4968
 * This function scans a range of bytes from `start` to `end` and returns the
4969
 * index of the first byte that is not an ASCII character (i.e., has the most
4970
 * significant bit set). If all characters in the range are ASCII, it returns
4971
 * `end - start`.
4972
 */
4973
static Py_ssize_t
4974
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4975
41.6M
{
4976
    // The search is done in `size_t` chunks.
4977
    // The start and end might not be aligned at `size_t` boundaries,
4978
    // so they're handled specially.
4979
4980
41.6M
    const unsigned char *p = start;
4981
4982
41.6M
    if (end - start >= SIZEOF_SIZE_T) {
4983
        // Avoid unaligned read.
4984
16.5M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4985
16.5M
        size_t u;
4986
16.5M
        memcpy(&u, p, sizeof(size_t));
4987
16.5M
        u &= ASCII_CHAR_MASK;
4988
16.5M
        if (u) {
4989
3.54M
            return (ctz(u) - 7) / 8;
4990
3.54M
        }
4991
13.0M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4992
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4993
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4994
        while (p < p2) {
4995
            if (*p & 0x80) {
4996
                return p - start;
4997
            }
4998
            p++;
4999
        }
5000
#endif
5001
5002
13.0M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5003
137M
        while (p <= e) {
5004
125M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5005
125M
            if (u) {
5006
1.05M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5007
1.05M
                return p - start + (ctz(u) - 7) / 8;
5008
#else
5009
                // big endian and minor compilers are difficult to test.
5010
                // fallback to per byte check.
5011
                break;
5012
#endif
5013
1.05M
            }
5014
123M
            p += SIZEOF_SIZE_T;
5015
123M
        }
5016
13.0M
    }
5017
37.0M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5018
41.6M
    assert((end - p) < SIZEOF_SIZE_T);
5019
    // we can not use *(const size_t*)p to avoid buffer overrun.
5020
37.0M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5021
37.0M
    if (u) {
5022
2.83M
        return p - start + (ctz(u) - 7) / 8;
5023
2.83M
    }
5024
34.2M
    return end - start;
5025
#else
5026
    while (p < end) {
5027
        if (*p & 0x80) {
5028
            break;
5029
        }
5030
        p++;
5031
    }
5032
    return p - start;
5033
#endif
5034
37.0M
}
5035
5036
/* Return 1 when ch has the form 0xxxxxxx or 11xxxxxx, i.e. it is the
   first byte of a UTF-8 sequence, and 0 for a 10xxxxxx continuation
   byte. */
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    unsigned int top_bit_clear = (~ch >> 7) & 1;  /* 0xxxxxxx */
    unsigned int second_bit_set = (ch >> 6) & 1;  /* x1xxxxxx */
    return top_bit_clear | second_bit_set;
}
5042
5043
static inline size_t
5044
vector_utf8_start_chars(size_t v)
5045
270M
{
5046
270M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5047
270M
}
5048
5049
5050
// Count the number of UTF-8 code points in a given byte sequence.
5051
static Py_ssize_t
5052
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5053
361k
{
5054
361k
    Py_ssize_t len = 0;
5055
5056
361k
    if (end - s >= SIZEOF_SIZE_T) {
5057
296k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5058
16.6k
            len += scalar_utf8_start_char(*s++);
5059
16.6k
        }
5060
5061
1.61M
        while (s + SIZEOF_SIZE_T <= end) {
5062
1.33M
            const unsigned char *e = end;
5063
1.33M
            if (e - s > SIZEOF_SIZE_T * 255) {
5064
1.05M
                e = s + SIZEOF_SIZE_T * 255;
5065
1.05M
            }
5066
1.33M
            Py_ssize_t vstart = 0;
5067
271M
            while (s + SIZEOF_SIZE_T <= e) {
5068
270M
                size_t v = *(size_t*)s;
5069
270M
                size_t vs = vector_utf8_start_chars(v);
5070
270M
                vstart += vs;
5071
270M
                s += SIZEOF_SIZE_T;
5072
270M
            }
5073
1.33M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5074
1.33M
            vstart += vstart >> 16;
5075
1.33M
#if SIZEOF_SIZE_T == 8
5076
1.33M
            vstart += vstart >> 32;
5077
1.33M
#endif
5078
1.33M
            len += vstart & 0x7ff;
5079
1.33M
        }
5080
280k
    }
5081
1.28M
    while (s < end) {
5082
926k
        len += scalar_utf8_start_char(*s++);
5083
926k
    }
5084
361k
    return len;
5085
361k
}
5086
5087
static Py_ssize_t
5088
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5089
4.71M
{
5090
4.71M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5091
4.71M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5092
4.57M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5093
696k
    {
5094
        /* Fast path, see in STRINGLIB(utf8_decode) for
5095
           an explanation. */
5096
696k
        const char *p = start;
5097
696k
        Py_UCS1 *q = dest;
5098
1.53M
        while (p + SIZEOF_SIZE_T <= end) {
5099
1.02M
            size_t value = *(const size_t *) p;
5100
1.02M
            if (value & ASCII_CHAR_MASK)
5101
178k
                break;
5102
842k
            *((size_t *)q) = value;
5103
842k
            p += SIZEOF_SIZE_T;
5104
842k
            q += SIZEOF_SIZE_T;
5105
842k
        }
5106
3.11M
        while (p < end) {
5107
2.61M
            if ((unsigned char)*p & 0x80)
5108
197k
                break;
5109
2.41M
            *q++ = *p++;
5110
2.41M
        }
5111
696k
        return p - start;
5112
696k
    }
5113
4.01M
#endif
5114
4.01M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5115
4.01M
                                         (const unsigned char*)end);
5116
4.01M
    memcpy(dest, start, pos);
5117
4.01M
    return pos;
5118
4.71M
}
5119
5120
static int
5121
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5122
                         const char *starts, const char *s, const char *end,
5123
                         _Py_error_handler error_handler,
5124
                         const char *errors,
5125
                         Py_ssize_t *consumed)
5126
7.44M
{
5127
7.44M
    Py_ssize_t startinpos, endinpos;
5128
7.44M
    const char *errmsg = "";
5129
7.44M
    PyObject *error_handler_obj = NULL;
5130
7.44M
    PyObject *exc = NULL;
5131
5132
340M
    while (s < end) {
5133
337M
        Py_UCS4 ch;
5134
337M
        int kind = writer->kind;
5135
5136
337M
        if (kind == PyUnicode_1BYTE_KIND) {
5137
7.67M
            if (PyUnicode_IS_ASCII(writer->buffer))
5138
7.08M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5139
595k
            else
5140
595k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5141
330M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5142
111M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5143
218M
        } else {
5144
218M
            assert(kind == PyUnicode_4BYTE_KIND);
5145
218M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5146
218M
        }
5147
5148
337M
        switch (ch) {
5149
4.89M
        case 0:
5150
4.89M
            if (s == end || consumed)
5151
4.86M
                goto End;
5152
24.1k
            errmsg = "unexpected end of data";
5153
24.1k
            startinpos = s - starts;
5154
24.1k
            endinpos = end - starts;
5155
24.1k
            break;
5156
240M
        case 1:
5157
240M
            errmsg = "invalid start byte";
5158
240M
            startinpos = s - starts;
5159
240M
            endinpos = startinpos + 1;
5160
240M
            break;
5161
83.3M
        case 2:
5162
83.3M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5163
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5164
0
            {
5165
                /* Truncated surrogate code in range D800-DFFF */
5166
0
                goto End;
5167
0
            }
5168
83.3M
            _Py_FALLTHROUGH;
5169
85.3M
        case 3:
5170
85.6M
        case 4:
5171
85.6M
            errmsg = "invalid continuation byte";
5172
85.6M
            startinpos = s - starts;
5173
85.6M
            endinpos = startinpos + ch - 1;
5174
85.6M
            break;
5175
7.13M
        default:
5176
            // ch doesn't fit into kind, so change the buffer kind to write
5177
            // the character
5178
7.13M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5179
0
                goto onError;
5180
7.13M
            continue;
5181
337M
        }
5182
5183
325M
        if (error_handler == _Py_ERROR_UNKNOWN)
5184
123k
            error_handler = _Py_GetErrorHandler(errors);
5185
5186
325M
        switch (error_handler) {
5187
0
        case _Py_ERROR_IGNORE:
5188
0
            s += (endinpos - startinpos);
5189
0
            break;
5190
5191
325M
        case _Py_ERROR_REPLACE:
5192
325M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5193
0
                goto onError;
5194
325M
            s += (endinpos - startinpos);
5195
325M
            break;
5196
5197
2.73k
        case _Py_ERROR_SURROGATEESCAPE:
5198
2.73k
        {
5199
2.73k
            Py_ssize_t i;
5200
5201
2.73k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5202
0
                goto onError;
5203
5.83k
            for (i=startinpos; i<endinpos; i++) {
5204
3.09k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5205
3.09k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5206
3.09k
                                ch + 0xdc00);
5207
3.09k
                writer->pos++;
5208
3.09k
            }
5209
2.73k
            s += (endinpos - startinpos);
5210
2.73k
            break;
5211
2.73k
        }
5212
5213
1.08k
        default:
5214
1.08k
            if (unicode_decode_call_errorhandler_writer(
5215
1.08k
                    errors, &error_handler_obj,
5216
1.08k
                    "utf-8", errmsg,
5217
1.08k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5218
1.08k
                    writer)) {
5219
1.08k
                goto onError;
5220
1.08k
            }
5221
5222
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5223
0
                return -1;
5224
0
            }
5225
325M
        }
5226
325M
    }
5227
5228
7.44M
End:
5229
7.44M
    if (consumed)
5230
660
        *consumed = s - starts;
5231
5232
7.44M
    Py_XDECREF(error_handler_obj);
5233
7.44M
    Py_XDECREF(exc);
5234
7.44M
    return 0;
5235
5236
1.08k
onError:
5237
1.08k
    Py_XDECREF(error_handler_obj);
5238
1.08k
    Py_XDECREF(exc);
5239
1.08k
    return -1;
5240
7.44M
}
5241
5242
5243
static PyObject *
5244
unicode_decode_utf8(const char *s, Py_ssize_t size,
5245
                    _Py_error_handler error_handler, const char *errors,
5246
                    Py_ssize_t *consumed)
5247
70.9M
{
5248
70.9M
    if (size == 0) {
5249
1.90M
        if (consumed) {
5250
0
            *consumed = 0;
5251
0
        }
5252
1.90M
        _Py_RETURN_UNICODE_EMPTY();
5253
1.90M
    }
5254
5255
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5256
69.0M
    if (size == 1 && (unsigned char)s[0] < 128) {
5257
31.4M
        if (consumed) {
5258
0
            *consumed = 1;
5259
0
        }
5260
31.4M
        return get_latin1_char((unsigned char)s[0]);
5261
31.4M
    }
5262
5263
    // I don't know this check is necessary or not. But there is a test
5264
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5265
37.6M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5266
0
        PyErr_NoMemory();
5267
0
        return NULL;
5268
0
    }
5269
5270
37.6M
    const char *starts = s;
5271
37.6M
    const char *end = s + size;
5272
5273
37.6M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5274
37.6M
    if (pos == size) {  // fast path: ASCII string.
5275
30.2M
        PyObject *u = PyUnicode_New(size, 127);
5276
30.2M
        if (u == NULL) {
5277
0
            return NULL;
5278
0
        }
5279
30.2M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5280
30.2M
        if (consumed) {
5281
91
            *consumed = size;
5282
91
        }
5283
30.2M
        return u;
5284
30.2M
    }
5285
5286
7.39M
    int maxchr = 127;
5287
7.39M
    Py_ssize_t maxsize = size;
5288
5289
7.39M
    unsigned char ch = (unsigned char)(s[pos]);
5290
    // error handler other than strict may remove/replace the invalid byte.
5291
    // consumed != NULL allows 1~3 bytes remainings.
5292
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5293
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5294
    // reallocation and copy.
5295
7.39M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5296
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5297
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5298
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5299
        // means that it is no longer necessary to allocate several times the required amount
5300
        // of memory.
5301
361k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5302
361k
        if (ch < 0xc4) { // latin1
5303
238k
            maxchr = 0xff;
5304
238k
        }
5305
122k
        else if (ch < 0xf0) { // ucs2
5306
105k
            maxchr = 0xffff;
5307
105k
        }
5308
17.0k
        else { // ucs4
5309
17.0k
            maxchr = 0x10ffff;
5310
17.0k
        }
5311
361k
    }
5312
7.39M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5313
7.39M
    if (!u) {
5314
0
        return NULL;
5315
0
    }
5316
5317
    // Use _PyUnicodeWriter after fast path is failed.
5318
7.39M
    _PyUnicodeWriter writer;
5319
7.39M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5320
7.39M
    if (maxchr <= 255) {
5321
7.27M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5322
7.27M
        s += pos;
5323
7.27M
        writer.pos = pos;
5324
7.27M
    }
5325
5326
7.39M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5327
7.39M
                                 error_handler, errors,
5328
7.39M
                                 consumed) < 0) {
5329
1.08k
        _PyUnicodeWriter_Dealloc(&writer);
5330
1.08k
        return NULL;
5331
1.08k
    }
5332
7.39M
    return _PyUnicodeWriter_Finish(&writer);
5333
7.39M
}
5334
5335
5336
// Used by PyUnicodeWriter_WriteUTF8() implementation
5337
int
5338
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5339
                            const char *s, Py_ssize_t size,
5340
                            _Py_error_handler error_handler, const char *errors,
5341
                            Py_ssize_t *consumed)
5342
4.03M
{
5343
4.03M
    if (size == 0) {
5344
8.36k
        if (consumed) {
5345
0
            *consumed = 0;
5346
0
        }
5347
8.36k
        return 0;
5348
8.36k
    }
5349
5350
    // fast path: try ASCII string.
5351
4.02M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5352
0
        return -1;
5353
0
    }
5354
5355
4.02M
    const char *starts = s;
5356
4.02M
    const char *end = s + size;
5357
4.02M
    Py_ssize_t decoded = 0;
5358
4.02M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5359
4.02M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5360
4.02M
        decoded = ascii_decode(s, end, dest);
5361
4.02M
        writer->pos += decoded;
5362
5363
4.02M
        if (decoded == size) {
5364
3.98M
            if (consumed) {
5365
800
                *consumed = size;
5366
800
            }
5367
3.98M
            return 0;
5368
3.98M
        }
5369
45.5k
        s += decoded;
5370
45.5k
    }
5371
5372
47.6k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5373
47.6k
                                    error_handler, errors, consumed);
5374
4.02M
}
5375
5376
5377
PyObject *
5378
PyUnicode_DecodeUTF8Stateful(const char *s,
5379
                             Py_ssize_t size,
5380
                             const char *errors,
5381
                             Py_ssize_t *consumed)
5382
70.7M
{
5383
70.7M
    return unicode_decode_utf8(s, size,
5384
70.7M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5385
70.7M
                               errors, consumed);
5386
70.7M
}
5387
5388
5389
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5390
   non-zero, use strict error handler otherwise.
5391
5392
   On success, write a pointer to a newly allocated wide character string into
5393
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5394
   (in number of wchar_t units) into *wlen (if wlen is set).
5395
5396
   On memory allocation failure, return -1.
5397
5398
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5399
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5400
   is not NULL, write the decoding error message into *reason. */
5401
int
5402
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5403
                 const char **reason, _Py_error_handler errors)
5404
11.1k
{
5405
11.1k
    const char *orig_s = s;
5406
11.1k
    const char *e;
5407
11.1k
    wchar_t *unicode;
5408
11.1k
    Py_ssize_t outpos;
5409
5410
11.1k
    int surrogateescape = 0;
5411
11.1k
    int surrogatepass = 0;
5412
11.1k
    switch (errors)
5413
11.1k
    {
5414
0
    case _Py_ERROR_STRICT:
5415
0
        break;
5416
11.1k
    case _Py_ERROR_SURROGATEESCAPE:
5417
11.1k
        surrogateescape = 1;
5418
11.1k
        break;
5419
0
    case _Py_ERROR_SURROGATEPASS:
5420
0
        surrogatepass = 1;
5421
0
        break;
5422
0
    default:
5423
0
        return -3;
5424
11.1k
    }
5425
5426
    /* Note: size will always be longer than the resulting Unicode
5427
       character count */
5428
11.1k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5429
0
        return -1;
5430
0
    }
5431
5432
11.1k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5433
11.1k
    if (!unicode) {
5434
0
        return -1;
5435
0
    }
5436
5437
    /* Unpack UTF-8 encoded data */
5438
11.1k
    e = s + size;
5439
11.1k
    outpos = 0;
5440
11.1k
    while (s < e) {
5441
11.1k
        Py_UCS4 ch;
5442
11.1k
#if SIZEOF_WCHAR_T == 4
5443
11.1k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5444
#else
5445
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5446
#endif
5447
11.1k
        if (ch > 0xFF) {
5448
0
#if SIZEOF_WCHAR_T == 4
5449
0
            Py_UNREACHABLE();
5450
#else
5451
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5452
            /* write a surrogate pair */
5453
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5454
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5455
#endif
5456
0
        }
5457
11.1k
        else {
5458
11.1k
            if (!ch && s == e) {
5459
11.1k
                break;
5460
11.1k
            }
5461
5462
0
            if (surrogateescape) {
5463
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5464
0
            }
5465
0
            else {
5466
                /* Is it a valid three-byte code? */
5467
0
                if (surrogatepass
5468
0
                    && (e - s) >= 3
5469
0
                    && (s[0] & 0xf0) == 0xe0
5470
0
                    && (s[1] & 0xc0) == 0x80
5471
0
                    && (s[2] & 0xc0) == 0x80)
5472
0
                {
5473
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5474
0
                    s += 3;
5475
0
                    unicode[outpos++] = ch;
5476
0
                }
5477
0
                else {
5478
0
                    PyMem_RawFree(unicode );
5479
0
                    if (reason != NULL) {
5480
0
                        switch (ch) {
5481
0
                        case 0:
5482
0
                            *reason = "unexpected end of data";
5483
0
                            break;
5484
0
                        case 1:
5485
0
                            *reason = "invalid start byte";
5486
0
                            break;
5487
                        /* 2, 3, 4 */
5488
0
                        default:
5489
0
                            *reason = "invalid continuation byte";
5490
0
                            break;
5491
0
                        }
5492
0
                    }
5493
0
                    if (wlen != NULL) {
5494
0
                        *wlen = s - orig_s;
5495
0
                    }
5496
0
                    return -2;
5497
0
                }
5498
0
            }
5499
0
        }
5500
11.1k
    }
5501
11.1k
    unicode[outpos] = L'\0';
5502
11.1k
    if (wlen) {
5503
11.1k
        *wlen = outpos;
5504
11.1k
    }
5505
11.1k
    *wstr = unicode;
5506
11.1k
    return 0;
5507
11.1k
}
5508
5509
5510
wchar_t*
5511
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5512
                               size_t *wlen)
5513
0
{
5514
0
    wchar_t *wstr;
5515
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5516
0
                               &wstr, wlen,
5517
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5518
0
    if (res != 0) {
5519
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5520
0
        assert(res != -3);
5521
0
        if (wlen) {
5522
0
            *wlen = (size_t)res;
5523
0
        }
5524
0
        return NULL;
5525
0
    }
5526
0
    return wstr;
5527
0
}
5528
5529
5530
/* UTF-8 encoder.
5531
5532
   On success, return 0 and write the newly allocated character string (use
5533
   PyMem_Free() to free the memory) into *str.
5534
5535
   On encoding failure, return -2 and write the position of the invalid
5536
   surrogate character into *error_pos (if error_pos is set) and the decoding
5537
   error message into *reason (if reason is set).
5538
5539
   On memory allocation failure, return -1. */
5540
int
5541
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5542
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5543
1.39k
{
5544
1.39k
    const Py_ssize_t max_char_size = 4;
5545
1.39k
    Py_ssize_t len = wcslen(text);
5546
5547
1.39k
    assert(len >= 0);
5548
5549
1.39k
    int surrogateescape = 0;
5550
1.39k
    int surrogatepass = 0;
5551
1.39k
    switch (errors)
5552
1.39k
    {
5553
136
    case _Py_ERROR_STRICT:
5554
136
        break;
5555
1.25k
    case _Py_ERROR_SURROGATEESCAPE:
5556
1.25k
        surrogateescape = 1;
5557
1.25k
        break;
5558
0
    case _Py_ERROR_SURROGATEPASS:
5559
0
        surrogatepass = 1;
5560
0
        break;
5561
0
    default:
5562
0
        return -3;
5563
1.39k
    }
5564
5565
1.39k
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5566
0
        return -1;
5567
0
    }
5568
1.39k
    char *bytes;
5569
1.39k
    if (raw_malloc) {
5570
1.39k
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5571
1.39k
    }
5572
0
    else {
5573
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5574
0
    }
5575
1.39k
    if (bytes == NULL) {
5576
0
        return -1;
5577
0
    }
5578
5579
1.39k
    char *p = bytes;
5580
1.39k
    Py_ssize_t i;
5581
90.6k
    for (i = 0; i < len; ) {
5582
89.2k
        Py_ssize_t ch_pos = i;
5583
89.2k
        Py_UCS4 ch = text[i];
5584
89.2k
        i++;
5585
#if Py_UNICODE_SIZE == 2
5586
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5587
            && i < len
5588
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5589
        {
5590
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5591
            i++;
5592
        }
5593
#endif
5594
5595
89.2k
        if (ch < 0x80) {
5596
            /* Encode ASCII */
5597
89.2k
            *p++ = (char) ch;
5598
5599
89.2k
        }
5600
0
        else if (ch < 0x0800) {
5601
            /* Encode Latin-1 */
5602
0
            *p++ = (char)(0xc0 | (ch >> 6));
5603
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5604
0
        }
5605
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5606
            /* surrogateescape error handler */
5607
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5608
0
                if (error_pos != NULL) {
5609
0
                    *error_pos = (size_t)ch_pos;
5610
0
                }
5611
0
                if (reason != NULL) {
5612
0
                    *reason = "encoding error";
5613
0
                }
5614
0
                if (raw_malloc) {
5615
0
                    PyMem_RawFree(bytes);
5616
0
                }
5617
0
                else {
5618
0
                    PyMem_Free(bytes);
5619
0
                }
5620
0
                return -2;
5621
0
            }
5622
0
            *p++ = (char)(ch & 0xff);
5623
0
        }
5624
0
        else if (ch < 0x10000) {
5625
0
            *p++ = (char)(0xe0 | (ch >> 12));
5626
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5627
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5628
0
        }
5629
0
        else {  /* ch >= 0x10000 */
5630
0
            assert(ch <= MAX_UNICODE);
5631
            /* Encode UCS4 Unicode ordinals */
5632
0
            *p++ = (char)(0xf0 | (ch >> 18));
5633
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5634
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5635
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5636
0
        }
5637
89.2k
    }
5638
1.39k
    *p++ = '\0';
5639
5640
1.39k
    size_t final_size = (p - bytes);
5641
1.39k
    char *bytes2;
5642
1.39k
    if (raw_malloc) {
5643
1.39k
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5644
1.39k
    }
5645
0
    else {
5646
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5647
0
    }
5648
1.39k
    if (bytes2 == NULL) {
5649
0
        if (error_pos != NULL) {
5650
0
            *error_pos = (size_t)-1;
5651
0
        }
5652
0
        if (raw_malloc) {
5653
0
            PyMem_RawFree(bytes);
5654
0
        }
5655
0
        else {
5656
0
            PyMem_Free(bytes);
5657
0
        }
5658
0
        return -1;
5659
0
    }
5660
1.39k
    *str = bytes2;
5661
1.39k
    return 0;
5662
1.39k
}
5663
5664
5665
/* Primary internal function which creates utf8 encoded bytes objects.
5666
5667
   Allocation strategy:  if the string is short, convert into a stack buffer
5668
   and allocate exactly as much space needed at the end.  Else allocate the
5669
   maximum possible needed (4 result bytes per Unicode character), and return
5670
   the excess memory at the end.
5671
*/
5672
static PyObject *
5673
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5674
                    const char *errors)
5675
21.8M
{
5676
21.8M
    if (!PyUnicode_Check(unicode)) {
5677
0
        PyErr_BadArgument();
5678
0
        return NULL;
5679
0
    }
5680
5681
21.8M
    if (PyUnicode_UTF8(unicode))
5682
10.9M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5683
10.9M
                                         PyUnicode_UTF8_LENGTH(unicode));
5684
5685
10.8M
    int kind = PyUnicode_KIND(unicode);
5686
10.8M
    const void *data = PyUnicode_DATA(unicode);
5687
10.8M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5688
5689
10.8M
    PyBytesWriter *writer;
5690
10.8M
    char *end;
5691
5692
10.8M
    switch (kind) {
5693
0
    default:
5694
0
        Py_UNREACHABLE();
5695
7.87M
    case PyUnicode_1BYTE_KIND:
5696
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5697
7.87M
        assert(!PyUnicode_IS_ASCII(unicode));
5698
7.87M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5699
7.87M
                                      error_handler, errors, &end);
5700
7.87M
        break;
5701
1.80M
    case PyUnicode_2BYTE_KIND:
5702
1.80M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5703
1.80M
                                      error_handler, errors, &end);
5704
1.80M
        break;
5705
1.21M
    case PyUnicode_4BYTE_KIND:
5706
1.21M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5707
1.21M
                                      error_handler, errors, &end);
5708
1.21M
        break;
5709
10.8M
    }
5710
5711
10.8M
    if (writer == NULL) {
5712
148k
        PyBytesWriter_Discard(writer);
5713
148k
        return NULL;
5714
148k
    }
5715
10.7M
    return PyBytesWriter_FinishWithPointer(writer, end);
5716
10.8M
}
5717
5718
static int
5719
unicode_fill_utf8(PyObject *unicode)
5720
170k
{
5721
170k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5722
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5723
170k
    assert(!PyUnicode_IS_ASCII(unicode));
5724
5725
170k
    int kind = PyUnicode_KIND(unicode);
5726
170k
    const void *data = PyUnicode_DATA(unicode);
5727
170k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5728
5729
170k
    PyBytesWriter *writer;
5730
170k
    char *end;
5731
5732
170k
    switch (kind) {
5733
0
    default:
5734
0
        Py_UNREACHABLE();
5735
121k
    case PyUnicode_1BYTE_KIND:
5736
121k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5737
121k
                                      _Py_ERROR_STRICT, NULL, &end);
5738
121k
        break;
5739
40.2k
    case PyUnicode_2BYTE_KIND:
5740
40.2k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5741
40.2k
                                      _Py_ERROR_STRICT, NULL, &end);
5742
40.2k
        break;
5743
8.12k
    case PyUnicode_4BYTE_KIND:
5744
8.12k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5745
8.12k
                                      _Py_ERROR_STRICT, NULL, &end);
5746
8.12k
        break;
5747
170k
    }
5748
170k
    if (writer == NULL) {
5749
207
        return -1;
5750
207
    }
5751
5752
169k
    const char *start = PyBytesWriter_GetData(writer);
5753
169k
    Py_ssize_t len = end - start;
5754
5755
169k
    char *cache = PyMem_Malloc(len + 1);
5756
169k
    if (cache == NULL) {
5757
0
        PyBytesWriter_Discard(writer);
5758
0
        PyErr_NoMemory();
5759
0
        return -1;
5760
0
    }
5761
169k
    memcpy(cache, start, len);
5762
169k
    cache[len] = '\0';
5763
169k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5764
169k
    PyUnicode_SET_UTF8(unicode, cache);
5765
169k
    PyBytesWriter_Discard(writer);
5766
169k
    return 0;
5767
169k
}
5768
5769
/* Encode 'unicode' to UTF-8 bytes via unicode_encode_utf8(), passing
   _Py_ERROR_UNKNOWN as the handler together with the 'errors' string. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    const _Py_error_handler handler = _Py_ERROR_UNKNOWN;
    return unicode_encode_utf8(unicode, handler, errors);
}
5774
5775
5776
/* Encode 'unicode' to UTF-8 bytes; forwards to _PyUnicode_AsUTF8String()
   with a NULL 'errors' argument. */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5781
5782
/* --- UTF-32 Codec ------------------------------------------------------- */
5783
5784
PyObject *
5785
PyUnicode_DecodeUTF32(const char *s,
5786
                      Py_ssize_t size,
5787
                      const char *errors,
5788
                      int *byteorder)
5789
90
{
5790
90
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5791
90
}
5792
5793
PyObject *
5794
PyUnicode_DecodeUTF32Stateful(const char *s,
5795
                              Py_ssize_t size,
5796
                              const char *errors,
5797
                              int *byteorder,
5798
                              Py_ssize_t *consumed)
5799
39.5k
{
5800
39.5k
    const char *starts = s;
5801
39.5k
    Py_ssize_t startinpos;
5802
39.5k
    Py_ssize_t endinpos;
5803
39.5k
    _PyUnicodeWriter writer;
5804
39.5k
    const unsigned char *q, *e;
5805
39.5k
    int le, bo = 0;       /* assume native ordering by default */
5806
39.5k
    const char *encoding;
5807
39.5k
    const char *errmsg = "";
5808
39.5k
    PyObject *errorHandler = NULL;
5809
39.5k
    PyObject *exc = NULL;
5810
5811
39.5k
    q = (const unsigned char *)s;
5812
39.5k
    e = q + size;
5813
5814
39.5k
    if (byteorder)
5815
39.4k
        bo = *byteorder;
5816
5817
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5818
       byte order setting accordingly. In native mode, the leading BOM
5819
       mark is skipped, in all other modes, it is copied to the output
5820
       stream as-is (giving a ZWNBSP character). */
5821
39.5k
    if (bo == 0 && size >= 4) {
5822
37.5k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5823
37.5k
        if (bom == 0x0000FEFF) {
5824
201
            bo = -1;
5825
201
            q += 4;
5826
201
        }
5827
37.3k
        else if (bom == 0xFFFE0000) {
5828
286
            bo = 1;
5829
286
            q += 4;
5830
286
        }
5831
37.5k
        if (byteorder)
5832
37.4k
            *byteorder = bo;
5833
37.5k
    }
5834
5835
39.5k
    if (q == e) {
5836
102
        if (consumed)
5837
0
            *consumed = size;
5838
102
        _Py_RETURN_UNICODE_EMPTY();
5839
102
    }
5840
5841
#ifdef WORDS_BIGENDIAN
5842
    le = bo < 0;
5843
#else
5844
39.4k
    le = bo <= 0;
5845
39.4k
#endif
5846
39.4k
    encoding = le ? "utf-32-le" : "utf-32-be";
5847
5848
39.4k
    _PyUnicodeWriter_Init(&writer);
5849
39.4k
    writer.min_length = (e - q + 3) / 4;
5850
39.4k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5851
0
        goto onError;
5852
5853
122k
    while (1) {
5854
122k
        Py_UCS4 ch = 0;
5855
122k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5856
5857
122k
        if (e - q >= 4) {
5858
96.9k
            int kind = writer.kind;
5859
96.9k
            void *data = writer.data;
5860
96.9k
            const unsigned char *last = e - 4;
5861
96.9k
            Py_ssize_t pos = writer.pos;
5862
96.9k
            if (le) {
5863
2.15M
                do {
5864
2.15M
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5865
2.15M
                    if (ch > maxch)
5866
89.9k
                        break;
5867
2.06M
                    if (kind != PyUnicode_1BYTE_KIND &&
5868
2.03M
                        Py_UNICODE_IS_SURROGATE(ch))
5869
142
                        break;
5870
2.06M
                    PyUnicode_WRITE(kind, data, pos++, ch);
5871
2.06M
                    q += 4;
5872
2.06M
                } while (q <= last);
5873
91.3k
            }
5874
5.64k
            else {
5875
8.85k
                do {
5876
8.85k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5877
8.85k
                    if (ch > maxch)
5878
5.34k
                        break;
5879
3.51k
                    if (kind != PyUnicode_1BYTE_KIND &&
5880
2.84k
                        Py_UNICODE_IS_SURROGATE(ch))
5881
137
                        break;
5882
3.37k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5883
3.37k
                    q += 4;
5884
3.37k
                } while (q <= last);
5885
5.64k
            }
5886
96.9k
            writer.pos = pos;
5887
96.9k
        }
5888
5889
122k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5890
282
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5891
282
            startinpos = ((const char *)q) - starts;
5892
282
            endinpos = startinpos + 4;
5893
282
        }
5894
122k
        else if (ch <= maxch) {
5895
27.2k
            if (q == e || consumed)
5896
4.91k
                break;
5897
            /* remaining bytes at the end? (size should be divisible by 4) */
5898
22.2k
            errmsg = "truncated data";
5899
22.2k
            startinpos = ((const char *)q) - starts;
5900
22.2k
            endinpos = ((const char *)e) - starts;
5901
22.2k
        }
5902
95.3k
        else {
5903
95.3k
            if (ch < 0x110000) {
5904
5.05k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5905
0
                    goto onError;
5906
5.05k
                q += 4;
5907
5.05k
                continue;
5908
5.05k
            }
5909
90.2k
            errmsg = "code point not in range(0x110000)";
5910
90.2k
            startinpos = ((const char *)q) - starts;
5911
90.2k
            endinpos = startinpos + 4;
5912
90.2k
        }
5913
5914
        /* The remaining input chars are ignored if the callback
5915
           chooses to skip the input */
5916
112k
        if (unicode_decode_call_errorhandler_writer(
5917
112k
                errors, &errorHandler,
5918
112k
                encoding, errmsg,
5919
112k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5920
112k
                &writer))
5921
34.5k
            goto onError;
5922
112k
    }
5923
5924
4.91k
    if (consumed)
5925
0
        *consumed = (const char *)q-starts;
5926
5927
4.91k
    Py_XDECREF(errorHandler);
5928
4.91k
    Py_XDECREF(exc);
5929
4.91k
    return _PyUnicodeWriter_Finish(&writer);
5930
5931
34.5k
  onError:
5932
34.5k
    _PyUnicodeWriter_Dealloc(&writer);
5933
34.5k
    Py_XDECREF(errorHandler);
5934
34.5k
    Py_XDECREF(exc);
5935
34.5k
    return NULL;
5936
39.4k
}
5937
5938
/* Encode 'str' to UTF-32 and return a new bytes object.

   byteorder == 0 emits a BOM (0xFEFF, written in native order) followed by
   the data; the PY_LITTLE_ENDIAN conditional below decides which non-zero
   values count as native ordering.  Code points the kind-specific encoder
   stops on are passed to the 'errors' handler with the message
   "surrogates not allowed".

   Returns NULL on failure (PyErr_BadArgument() is called when 'str' is not
   a str). */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);

    /* One extra output unit is needed for the BOM. */
    const int with_bom = (byteorder == 0);
    if (len > PY_SSIZE_T_MAX / 4 - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + with_bom;

#if PY_LITTLE_ENDIAN
    const int native_ordering = byteorder <= 0;
#else
    const int native_ordering = byteorder >= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nsize * 4);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 4-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 4));
        uint32_t *dst = (uint32_t *)PyBytes_AS_STRING(result);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
                                 &dst, native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported in encode errors. */
    const char *encoding = (byteorder == -1) ? "utf-32-le"
                         : (byteorder == 1) ? "utf-32-be"
                         : "utf-32";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as possible; the encoder reports how many code
           points it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* The encoder stopped early: ask the error handler what to do with
           the code point at 'pos'. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize;
        Py_ssize_t moreunits;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must be a whole number of 4-byte units. */
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            /* A str replacement must be pure ASCII. */
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* four bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6091
6092
/* Encode 'unicode' to UTF-32; forwards to _PyUnicode_EncodeUTF32() with NULL
   'errors' and byteorder 0. */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, errors, byteorder);
}
6097
6098
/* --- UTF-16 Codec ------------------------------------------------------- */
6099
6100
PyObject *
6101
PyUnicode_DecodeUTF16(const char *s,
6102
                      Py_ssize_t size,
6103
                      const char *errors,
6104
                      int *byteorder)
6105
87
{
6106
87
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6107
87
}
6108
6109
PyObject *
6110
PyUnicode_DecodeUTF16Stateful(const char *s,
6111
                              Py_ssize_t size,
6112
                              const char *errors,
6113
                              int *byteorder,
6114
                              Py_ssize_t *consumed)
6115
15.0k
{
6116
15.0k
    const char *starts = s;
6117
15.0k
    Py_ssize_t startinpos;
6118
15.0k
    Py_ssize_t endinpos;
6119
15.0k
    _PyUnicodeWriter writer;
6120
15.0k
    const unsigned char *q, *e;
6121
15.0k
    int bo = 0;       /* assume native ordering by default */
6122
15.0k
    int native_ordering;
6123
15.0k
    const char *errmsg = "";
6124
15.0k
    PyObject *errorHandler = NULL;
6125
15.0k
    PyObject *exc = NULL;
6126
15.0k
    const char *encoding;
6127
6128
15.0k
    q = (const unsigned char *)s;
6129
15.0k
    e = q + size;
6130
6131
15.0k
    if (byteorder)
6132
14.9k
        bo = *byteorder;
6133
6134
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6135
       byte order setting accordingly. In native mode, the leading BOM
6136
       mark is skipped, in all other modes, it is copied to the output
6137
       stream as-is (giving a ZWNBSP character). */
6138
15.0k
    if (bo == 0 && size >= 2) {
6139
14.3k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6140
14.3k
        if (bom == 0xFEFF) {
6141
345
            q += 2;
6142
345
            bo = -1;
6143
345
        }
6144
13.9k
        else if (bom == 0xFFFE) {
6145
2.48k
            q += 2;
6146
2.48k
            bo = 1;
6147
2.48k
        }
6148
14.3k
        if (byteorder)
6149
14.2k
            *byteorder = bo;
6150
14.3k
    }
6151
6152
15.0k
    if (q == e) {
6153
96
        if (consumed)
6154
0
            *consumed = size;
6155
96
        _Py_RETURN_UNICODE_EMPTY();
6156
96
    }
6157
6158
14.9k
#if PY_LITTLE_ENDIAN
6159
14.9k
    native_ordering = bo <= 0;
6160
14.9k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6161
#else
6162
    native_ordering = bo >= 0;
6163
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6164
#endif
6165
6166
    /* Note: size will always be longer than the resulting Unicode
6167
       character count normally.  Error handler will take care of
6168
       resizing when needed. */
6169
14.9k
    _PyUnicodeWriter_Init(&writer);
6170
14.9k
    writer.min_length = (e - q + 1) / 2;
6171
14.9k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6172
0
        goto onError;
6173
6174
55.1k
    while (1) {
6175
55.1k
        Py_UCS4 ch = 0;
6176
55.1k
        if (e - q >= 2) {
6177
47.0k
            int kind = writer.kind;
6178
47.0k
            if (kind == PyUnicode_1BYTE_KIND) {
6179
18.0k
                if (PyUnicode_IS_ASCII(writer.buffer))
6180
14.4k
                    ch = asciilib_utf16_decode(&q, e,
6181
14.4k
                            (Py_UCS1*)writer.data, &writer.pos,
6182
14.4k
                            native_ordering);
6183
3.65k
                else
6184
3.65k
                    ch = ucs1lib_utf16_decode(&q, e,
6185
3.65k
                            (Py_UCS1*)writer.data, &writer.pos,
6186
3.65k
                            native_ordering);
6187
28.9k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6188
12.8k
                ch = ucs2lib_utf16_decode(&q, e,
6189
12.8k
                        (Py_UCS2*)writer.data, &writer.pos,
6190
12.8k
                        native_ordering);
6191
16.1k
            } else {
6192
16.1k
                assert(kind == PyUnicode_4BYTE_KIND);
6193
16.1k
                ch = ucs4lib_utf16_decode(&q, e,
6194
16.1k
                        (Py_UCS4*)writer.data, &writer.pos,
6195
16.1k
                        native_ordering);
6196
16.1k
            }
6197
47.0k
        }
6198
6199
55.1k
        switch (ch)
6200
55.1k
        {
6201
15.5k
        case 0:
6202
            /* remaining byte at the end? (size should be even) */
6203
15.5k
            if (q == e || consumed)
6204
10.0k
                goto End;
6205
5.56k
            errmsg = "truncated data";
6206
5.56k
            startinpos = ((const char *)q) - starts;
6207
5.56k
            endinpos = ((const char *)e) - starts;
6208
5.56k
            break;
6209
            /* The remaining input chars are ignored if the callback
6210
               chooses to skip the input */
6211
1.59k
        case 1:
6212
1.59k
            q -= 2;
6213
1.59k
            if (consumed)
6214
0
                goto End;
6215
1.59k
            errmsg = "unexpected end of data";
6216
1.59k
            startinpos = ((const char *)q) - starts;
6217
1.59k
            endinpos = ((const char *)e) - starts;
6218
1.59k
            break;
6219
14.0k
        case 2:
6220
14.0k
            errmsg = "illegal encoding";
6221
14.0k
            startinpos = ((const char *)q) - 2 - starts;
6222
14.0k
            endinpos = startinpos + 2;
6223
14.0k
            break;
6224
7.41k
        case 3:
6225
7.41k
            errmsg = "illegal UTF-16 surrogate";
6226
7.41k
            startinpos = ((const char *)q) - 4 - starts;
6227
7.41k
            endinpos = startinpos + 2;
6228
7.41k
            break;
6229
16.4k
        default:
6230
16.4k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6231
0
                goto onError;
6232
16.4k
            continue;
6233
55.1k
        }
6234
6235
28.6k
        if (unicode_decode_call_errorhandler_writer(
6236
28.6k
                errors,
6237
28.6k
                &errorHandler,
6238
28.6k
                encoding, errmsg,
6239
28.6k
                &starts,
6240
28.6k
                (const char **)&e,
6241
28.6k
                &startinpos,
6242
28.6k
                &endinpos,
6243
28.6k
                &exc,
6244
28.6k
                (const char **)&q,
6245
28.6k
                &writer))
6246
4.94k
            goto onError;
6247
28.6k
    }
6248
6249
10.0k
End:
6250
10.0k
    if (consumed)
6251
0
        *consumed = (const char *)q-starts;
6252
6253
10.0k
    Py_XDECREF(errorHandler);
6254
10.0k
    Py_XDECREF(exc);
6255
10.0k
    return _PyUnicodeWriter_Finish(&writer);
6256
6257
4.94k
  onError:
6258
4.94k
    _PyUnicodeWriter_Dealloc(&writer);
6259
4.94k
    Py_XDECREF(errorHandler);
6260
4.94k
    Py_XDECREF(exc);
6261
4.94k
    return NULL;
6262
14.9k
}
6263
6264
/* Encode the str object `str` as UTF-16 and return a new bytes object.
 *
 * errors     Error-handler name, forwarded to
 *            unicode_encode_call_errorhandler() when a character cannot be
 *            encoded (reported as "surrogates not allowed").
 * byteorder  < 0: little-endian, > 0: big-endian, 0: native order with a
 *            leading U+FEFF BOM.
 *
 * Returns NULL with an exception set on failure.
 */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int with_bom = (byteorder == 0);

    /* Only UCS4 strings can hold characters >= 0x10000; each of those
       takes one extra 16-bit unit in the output. */
    Py_ssize_t pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                pairs++;
            }
        }
    }

    /* Make sure (len + pairs + with_bom) * 2 cannot overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - pairs - with_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nsize = len + pairs + with_bom;

#if PY_BIG_ENDIAN
    const int native_ordering = byteorder >= 0;
#else
    const int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *bytes = PyBytes_FromStringAndSize(NULL, nsize * 2);
        if (bytes == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(bytes), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(bytes);
        if (with_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst,
                                 native_ordering);
        }
        return bytes;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (with_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name used in error reports. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as much as possible; the helper returns how many source
           characters it handled and stops early at one it cannot encode. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* Ask the error handler for a replacement for str[pos]. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (rep == NULL) {
            goto error;
        }

        /* A bytes replacement must be a whole number of 16-bit units; a
           str replacement must be pure ASCII. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_ok;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            rep_ok = !(repsize & 1);
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            repsize = PyUnicode_GET_LENGTH(rep);
            moreunits = repsize;
            rep_ok = PyUnicode_IS_ASCII(rep);
        }
        if (!rep_ok) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }

        /* Units were already reserved for the characters the handler
           skipped (two bytes per surrogate); only grow by the difference. */
        moreunits += pos - newpos;
        pos = newpos;
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            /* Bytes replacements are copied verbatim. */
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Trim to the size actually written.  This matters, for example, when
       isolated surrogates are dropped by the 'ignore' handler. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6429
6430
/* Encode `unicode` as UTF-16 by delegating to _PyUnicode_EncodeUTF16()
   with no error-handler name and byteorder 0 (native order with a BOM). */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6435
6436
_PyUnicode_Name_CAPI *
6437
_PyUnicode_GetNameCAPI(void)
6438
2.08k
{
6439
2.08k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6440
2.08k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6441
6442
2.08k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6443
2.08k
    if (ucnhash_capi == NULL) {
6444
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6445
1
                PyUnicodeData_CAPSULE_NAME, 1);
6446
6447
        // It's fine if we overwrite the value here. It's always the same value.
6448
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6449
1
    }
6450
2.08k
    return ucnhash_capi;
6451
2.08k
}
6452
6453
/* --- Unicode Escape Codec ----------------------------------------------- */
6454
6455
PyObject *
6456
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6457
                               Py_ssize_t size,
6458
                               const char *errors,
6459
                               Py_ssize_t *consumed,
6460
                               int *first_invalid_escape_char,
6461
                               const char **first_invalid_escape_ptr)
6462
26.6k
{
6463
26.6k
    const char *starts = s;
6464
26.6k
    const char *initial_starts = starts;
6465
26.6k
    _PyUnicodeWriter writer;
6466
26.6k
    const char *end;
6467
26.6k
    PyObject *errorHandler = NULL;
6468
26.6k
    PyObject *exc = NULL;
6469
26.6k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6470
6471
    // so we can remember if we've seen an invalid escape char or not
6472
26.6k
    *first_invalid_escape_char = -1;
6473
26.6k
    *first_invalid_escape_ptr = NULL;
6474
6475
26.6k
    if (size == 0) {
6476
2.05k
        if (consumed) {
6477
0
            *consumed = 0;
6478
0
        }
6479
2.05k
        _Py_RETURN_UNICODE_EMPTY();
6480
2.05k
    }
6481
    /* Escaped strings will always be longer than the resulting
6482
       Unicode string, so we start with size here and then reduce the
6483
       length after conversion to the true value.
6484
       (but if the error callback returns a long replacement string
6485
       we'll have to allocate more space) */
6486
24.5k
    _PyUnicodeWriter_Init(&writer);
6487
24.5k
    writer.min_length = size;
6488
24.5k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6489
0
        goto onError;
6490
0
    }
6491
6492
24.5k
    end = s + size;
6493
211k
    while (s < end) {
6494
187k
        unsigned char c = (unsigned char) *s++;
6495
187k
        Py_UCS4 ch;
6496
187k
        int count;
6497
187k
        const char *message;
6498
6499
187k
#define WRITE_ASCII_CHAR(ch)                                                  \
6500
187k
            do {                                                              \
6501
19.1k
                assert(ch <= 127);                                            \
6502
19.1k
                assert(writer.pos < writer.size);                             \
6503
19.1k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6504
19.1k
            } while(0)
6505
6506
187k
#define WRITE_CHAR(ch)                                                        \
6507
187k
            do {                                                              \
6508
173k
                if (ch <= writer.maxchar) {                                   \
6509
159k
                    assert(writer.pos < writer.size);                         \
6510
159k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6511
159k
                }                                                             \
6512
173k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6513
0
                    goto onError;                                             \
6514
0
                }                                                             \
6515
173k
            } while(0)
6516
6517
        /* Non-escape characters are interpreted as Unicode ordinals */
6518
187k
        if (c != '\\') {
6519
134k
            WRITE_CHAR(c);
6520
134k
            continue;
6521
134k
        }
6522
6523
53.2k
        Py_ssize_t startinpos = s - starts - 1;
6524
        /* \ - Escapes */
6525
53.2k
        if (s >= end) {
6526
0
            message = "\\ at end of string";
6527
0
            goto incomplete;
6528
0
        }
6529
53.2k
        c = (unsigned char) *s++;
6530
6531
53.2k
        assert(writer.pos < writer.size);
6532
53.2k
        switch (c) {
6533
6534
            /* \x escapes */
6535
596
        case '\n': continue;
6536
1.59k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6537
925
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6538
1.64k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6539
1.38k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6540
        /* FF */
6541
2.77k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6542
767
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6543
1.09k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6544
1.59k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6545
        /* VT */
6546
904
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6547
        /* BEL, not classic C */
6548
631
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6549
6550
            /* \OOO (octal) escapes */
6551
3.78k
        case '0': case '1': case '2': case '3':
6552
6.67k
        case '4': case '5': case '6': case '7':
6553
6.67k
            ch = c - '0';
6554
6.67k
            if (s < end && '0' <= *s && *s <= '7') {
6555
3.54k
                ch = (ch<<3) + *s++ - '0';
6556
3.54k
                if (s < end && '0' <= *s && *s <= '7') {
6557
2.21k
                    ch = (ch<<3) + *s++ - '0';
6558
2.21k
                }
6559
3.54k
            }
6560
6.67k
            if (ch > 0377) {
6561
1.27k
                if (*first_invalid_escape_char == -1) {
6562
757
                    *first_invalid_escape_char = ch;
6563
757
                    if (starts == initial_starts) {
6564
                        /* Back up 3 chars, since we've already incremented s. */
6565
757
                        *first_invalid_escape_ptr = s - 3;
6566
757
                    }
6567
757
                }
6568
1.27k
            }
6569
6.67k
            WRITE_CHAR(ch);
6570
6.67k
            continue;
6571
6572
            /* hex escapes */
6573
            /* \xXX */
6574
6.67k
        case 'x':
6575
5.65k
            count = 2;
6576
5.65k
            message = "truncated \\xXX escape";
6577
5.65k
            goto hexescape;
6578
6579
            /* \uXXXX */
6580
6.78k
        case 'u':
6581
6.78k
            count = 4;
6582
6.78k
            message = "truncated \\uXXXX escape";
6583
6.78k
            goto hexescape;
6584
6585
            /* \UXXXXXXXX */
6586
12.2k
        case 'U':
6587
12.2k
            count = 8;
6588
12.2k
            message = "truncated \\UXXXXXXXX escape";
6589
24.7k
        hexescape:
6590
161k
            for (ch = 0; count; ++s, --count) {
6591
136k
                if (s >= end) {
6592
7
                    goto incomplete;
6593
7
                }
6594
136k
                c = (unsigned char)*s;
6595
136k
                ch <<= 4;
6596
136k
                if (c >= '0' && c <= '9') {
6597
104k
                    ch += c - '0';
6598
104k
                }
6599
32.5k
                else if (c >= 'a' && c <= 'f') {
6600
32.3k
                    ch += c - ('a' - 10);
6601
32.3k
                }
6602
249
                else if (c >= 'A' && c <= 'F') {
6603
239
                    ch += c - ('A' - 10);
6604
239
                }
6605
10
                else {
6606
10
                    goto error;
6607
10
                }
6608
136k
            }
6609
6610
            /* when we get here, ch is a 32-bit unicode character */
6611
24.7k
            if (ch > MAX_UNICODE) {
6612
1
                message = "illegal Unicode character";
6613
1
                goto error;
6614
1
            }
6615
6616
24.7k
            WRITE_CHAR(ch);
6617
24.7k
            continue;
6618
6619
            /* \N{name} */
6620
24.7k
        case 'N':
6621
2.08k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6622
2.08k
            if (ucnhash_capi == NULL) {
6623
0
                PyErr_SetString(
6624
0
                        PyExc_UnicodeError,
6625
0
                        "\\N escapes not supported (can't load unicodedata module)"
6626
0
                );
6627
0
                goto onError;
6628
0
            }
6629
6630
2.08k
            message = "malformed \\N character escape";
6631
2.08k
            if (s >= end) {
6632
2
                goto incomplete;
6633
2
            }
6634
2.07k
            if (*s == '{') {
6635
2.07k
                const char *start = ++s;
6636
2.07k
                size_t namelen;
6637
                /* look for the closing brace */
6638
27.2k
                while (s < end && *s != '}')
6639
25.1k
                    s++;
6640
2.07k
                if (s >= end) {
6641
7
                    goto incomplete;
6642
7
                }
6643
2.06k
                namelen = s - start;
6644
2.06k
                if (namelen) {
6645
                    /* found a name.  look it up in the unicode database */
6646
2.06k
                    s++;
6647
2.06k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6648
2.06k
                    if (namelen <= INT_MAX &&
6649
2.06k
                        ucnhash_capi->getcode(start, (int)namelen,
6650
2.06k
                                              &ch, 0)) {
6651
2.00k
                        assert(ch <= MAX_UNICODE);
6652
2.00k
                        WRITE_CHAR(ch);
6653
2.00k
                        continue;
6654
2.00k
                    }
6655
63
                    message = "unknown Unicode character name";
6656
63
                }
6657
2.06k
            }
6658
69
            goto error;
6659
6660
5.85k
        default:
6661
5.85k
            if (*first_invalid_escape_char == -1) {
6662
3.44k
                *first_invalid_escape_char = c;
6663
3.44k
                if (starts == initial_starts) {
6664
                    /* Back up one char, since we've already incremented s. */
6665
3.44k
                    *first_invalid_escape_ptr = s - 1;
6666
3.44k
                }
6667
3.44k
            }
6668
5.85k
            WRITE_ASCII_CHAR('\\');
6669
5.85k
            WRITE_CHAR(c);
6670
5.85k
            continue;
6671
53.2k
        }
6672
6673
16
      incomplete:
6674
16
        if (consumed) {
6675
0
            *consumed = startinpos;
6676
0
            break;
6677
0
        }
6678
96
      error:;
6679
96
        Py_ssize_t endinpos = s-starts;
6680
96
        writer.min_length = end - s + writer.pos;
6681
96
        if (unicode_decode_call_errorhandler_writer(
6682
96
                errors, &errorHandler,
6683
96
                "unicodeescape", message,
6684
96
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6685
96
                &writer)) {
6686
96
            goto onError;
6687
96
        }
6688
96
        assert(end - s <= writer.size - writer.pos);
6689
6690
0
#undef WRITE_ASCII_CHAR
6691
0
#undef WRITE_CHAR
6692
0
    }
6693
6694
24.4k
    Py_XDECREF(errorHandler);
6695
24.4k
    Py_XDECREF(exc);
6696
24.4k
    return _PyUnicodeWriter_Finish(&writer);
6697
6698
96
  onError:
6699
96
    _PyUnicodeWriter_Dealloc(&writer);
6700
96
    Py_XDECREF(errorHandler);
6701
96
    Py_XDECREF(exc);
6702
96
    return NULL;
6703
24.5k
}
6704
6705
PyObject *
6706
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6707
                              Py_ssize_t size,
6708
                              const char *errors,
6709
                              Py_ssize_t *consumed)
6710
0
{
6711
0
    int first_invalid_escape_char;
6712
0
    const char *first_invalid_escape_ptr;
6713
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6714
0
                                                      consumed,
6715
0
                                                      &first_invalid_escape_char,
6716
0
                                                      &first_invalid_escape_ptr);
6717
0
    if (result == NULL)
6718
0
        return NULL;
6719
0
    if (first_invalid_escape_char != -1) {
6720
0
        if (first_invalid_escape_char > 0xff) {
6721
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6722
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6723
0
                                 "Such sequences will not work in the future. ",
6724
0
                                 first_invalid_escape_char) < 0)
6725
0
            {
6726
0
                Py_DECREF(result);
6727
0
                return NULL;
6728
0
            }
6729
0
        }
6730
0
        else {
6731
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6732
0
                                 "\"\\%c\" is an invalid escape sequence. "
6733
0
                                 "Such sequences will not work in the future. ",
6734
0
                                 first_invalid_escape_char) < 0)
6735
0
            {
6736
0
                Py_DECREF(result);
6737
0
                return NULL;
6738
0
            }
6739
0
        }
6740
0
    }
6741
0
    return result;
6742
0
}
6743
6744
PyObject *
6745
PyUnicode_DecodeUnicodeEscape(const char *s,
6746
                              Py_ssize_t size,
6747
                              const char *errors)
6748
0
{
6749
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6750
0
}
6751
6752
/* Return a Unicode-Escape string version of the Unicode object. */
6753
6754
PyObject *
6755
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6756
387k
{
6757
387k
    if (!PyUnicode_Check(unicode)) {
6758
0
        PyErr_BadArgument();
6759
0
        return NULL;
6760
0
    }
6761
6762
387k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6763
387k
    if (len == 0) {
6764
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6765
0
    }
6766
387k
    int kind = PyUnicode_KIND(unicode);
6767
387k
    const void *data = PyUnicode_DATA(unicode);
6768
6769
    /* Initial allocation is based on the longest-possible character
6770
     * escape.
6771
     *
6772
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6773
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6774
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6775
387k
    Py_ssize_t expandsize = kind * 2 + 2;
6776
387k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6777
0
        return PyErr_NoMemory();
6778
0
    }
6779
6780
387k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6781
387k
    if (writer == NULL) {
6782
0
        return NULL;
6783
0
    }
6784
387k
    char *p = PyBytesWriter_GetData(writer);
6785
6786
775k
    for (Py_ssize_t i = 0; i < len; i++) {
6787
387k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6788
6789
        /* U+0000-U+00ff range */
6790
387k
        if (ch < 0x100) {
6791
381k
            if (ch >= ' ' && ch < 127) {
6792
27.1k
                if (ch != '\\') {
6793
                    /* Copy printable US ASCII as-is */
6794
0
                    *p++ = (char) ch;
6795
0
                }
6796
                /* Escape backslashes */
6797
27.1k
                else {
6798
27.1k
                    *p++ = '\\';
6799
27.1k
                    *p++ = '\\';
6800
27.1k
                }
6801
27.1k
            }
6802
6803
            /* Map special whitespace to '\t', \n', '\r' */
6804
354k
            else if (ch == '\t') {
6805
3.56k
                *p++ = '\\';
6806
3.56k
                *p++ = 't';
6807
3.56k
            }
6808
350k
            else if (ch == '\n') {
6809
1.00k
                *p++ = '\\';
6810
1.00k
                *p++ = 'n';
6811
1.00k
            }
6812
349k
            else if (ch == '\r') {
6813
820
                *p++ = '\\';
6814
820
                *p++ = 'r';
6815
820
            }
6816
6817
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6818
349k
            else {
6819
349k
                *p++ = '\\';
6820
349k
                *p++ = 'x';
6821
349k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6822
349k
                *p++ = Py_hexdigits[ch & 0x000F];
6823
349k
            }
6824
381k
        }
6825
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6826
6.18k
        else if (ch < 0x10000) {
6827
5.06k
            *p++ = '\\';
6828
5.06k
            *p++ = 'u';
6829
5.06k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6830
5.06k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6831
5.06k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6832
5.06k
            *p++ = Py_hexdigits[ch & 0x000F];
6833
5.06k
        }
6834
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6835
1.12k
        else {
6836
6837
            /* Make sure that the first two digits are zero */
6838
1.12k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6839
1.12k
            *p++ = '\\';
6840
1.12k
            *p++ = 'U';
6841
1.12k
            *p++ = '0';
6842
1.12k
            *p++ = '0';
6843
1.12k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6844
1.12k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6845
1.12k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6846
1.12k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6847
1.12k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6848
1.12k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6849
1.12k
        }
6850
387k
    }
6851
6852
387k
    return PyBytesWriter_FinishWithPointer(writer, p);
6853
387k
}
6854
6855
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6856
6857
PyObject *
6858
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6859
                                          Py_ssize_t size,
6860
                                          const char *errors,
6861
                                          Py_ssize_t *consumed)
6862
0
{
6863
0
    const char *starts = s;
6864
0
    _PyUnicodeWriter writer;
6865
0
    const char *end;
6866
0
    PyObject *errorHandler = NULL;
6867
0
    PyObject *exc = NULL;
6868
6869
0
    if (size == 0) {
6870
0
        if (consumed) {
6871
0
            *consumed = 0;
6872
0
        }
6873
0
        _Py_RETURN_UNICODE_EMPTY();
6874
0
    }
6875
6876
    /* Escaped strings will always be longer than the resulting
6877
       Unicode string, so we start with size here and then reduce the
6878
       length after conversion to the true value. (But decoding error
6879
       handler might have to resize the string) */
6880
0
    _PyUnicodeWriter_Init(&writer);
6881
0
    writer.min_length = size;
6882
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6883
0
        goto onError;
6884
0
    }
6885
6886
0
    end = s + size;
6887
0
    while (s < end) {
6888
0
        unsigned char c = (unsigned char) *s++;
6889
0
        Py_UCS4 ch;
6890
0
        int count;
6891
0
        const char *message;
6892
6893
0
#define WRITE_CHAR(ch)                                                        \
6894
0
            do {                                                              \
6895
0
                if (ch <= writer.maxchar) {                                   \
6896
0
                    assert(writer.pos < writer.size);                         \
6897
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6898
0
                }                                                             \
6899
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6900
0
                    goto onError;                                             \
6901
0
                }                                                             \
6902
0
            } while(0)
6903
6904
        /* Non-escape characters are interpreted as Unicode ordinals */
6905
0
        if (c != '\\' || (s >= end && !consumed)) {
6906
0
            WRITE_CHAR(c);
6907
0
            continue;
6908
0
        }
6909
6910
0
        Py_ssize_t startinpos = s - starts - 1;
6911
        /* \ - Escapes */
6912
0
        if (s >= end) {
6913
0
            assert(consumed);
6914
            // Set message to silent compiler warning.
6915
            // Actually it is never used.
6916
0
            message = "\\ at end of string";
6917
0
            goto incomplete;
6918
0
        }
6919
6920
0
        c = (unsigned char) *s++;
6921
0
        if (c == 'u') {
6922
0
            count = 4;
6923
0
            message = "truncated \\uXXXX escape";
6924
0
        }
6925
0
        else if (c == 'U') {
6926
0
            count = 8;
6927
0
            message = "truncated \\UXXXXXXXX escape";
6928
0
        }
6929
0
        else {
6930
0
            assert(writer.pos < writer.size);
6931
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6932
0
            WRITE_CHAR(c);
6933
0
            continue;
6934
0
        }
6935
6936
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6937
0
        for (ch = 0; count; ++s, --count) {
6938
0
            if (s >= end) {
6939
0
                goto incomplete;
6940
0
            }
6941
0
            c = (unsigned char)*s;
6942
0
            ch <<= 4;
6943
0
            if (c >= '0' && c <= '9') {
6944
0
                ch += c - '0';
6945
0
            }
6946
0
            else if (c >= 'a' && c <= 'f') {
6947
0
                ch += c - ('a' - 10);
6948
0
            }
6949
0
            else if (c >= 'A' && c <= 'F') {
6950
0
                ch += c - ('A' - 10);
6951
0
            }
6952
0
            else {
6953
0
                goto error;
6954
0
            }
6955
0
        }
6956
0
        if (ch > MAX_UNICODE) {
6957
0
            message = "\\Uxxxxxxxx out of range";
6958
0
            goto error;
6959
0
        }
6960
0
        WRITE_CHAR(ch);
6961
0
        continue;
6962
6963
0
      incomplete:
6964
0
        if (consumed) {
6965
0
            *consumed = startinpos;
6966
0
            break;
6967
0
        }
6968
0
      error:;
6969
0
        Py_ssize_t endinpos = s-starts;
6970
0
        writer.min_length = end - s + writer.pos;
6971
0
        if (unicode_decode_call_errorhandler_writer(
6972
0
                errors, &errorHandler,
6973
0
                "rawunicodeescape", message,
6974
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6975
0
                &writer)) {
6976
0
            goto onError;
6977
0
        }
6978
0
        assert(end - s <= writer.size - writer.pos);
6979
6980
0
#undef WRITE_CHAR
6981
0
    }
6982
0
    Py_XDECREF(errorHandler);
6983
0
    Py_XDECREF(exc);
6984
0
    return _PyUnicodeWriter_Finish(&writer);
6985
6986
0
  onError:
6987
0
    _PyUnicodeWriter_Dealloc(&writer);
6988
0
    Py_XDECREF(errorHandler);
6989
0
    Py_XDECREF(exc);
6990
0
    return NULL;
6991
0
}
6992
6993
PyObject *
6994
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6995
                                 Py_ssize_t size,
6996
                                 const char *errors)
6997
0
{
6998
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6999
0
}
7000
7001
7002
/* Encode a str object with the raw-unicode-escape codec and return a new
   bytes object, or NULL with an exception set.

   Code points below U+0100 are copied as single bytes, U+0100..U+FFFF become
   \uHHHH and everything above becomes \U00HHHHHH.  A non-str argument is
   rejected with PyErr_BadArgument(). */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character is below U+0100: the storage is the result. */
        return PyBytes_FromStringAndSize(data, length);
    }

    /* Worst-case bytes per character: 6 for the 2-byte kind ("\uHHHH") and
       10 for the 4-byte kind ("\U00HHHHHH"). */
    const Py_ssize_t max_expansion = kind * 2 + 2;
    if (length > PY_SSIZE_T_MAX / max_expansion) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(max_expansion * length);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int shift;

        /* U+0000-U+00ff: copied through unchanged. */
        if (ch < 0x100) {
            *out++ = (char)ch;
            continue;
        }

        if (ch < 0x10000) {
            /* U+0100-U+ffff: '\uHHHH', four hex digits. */
            *out++ = '\\';
            *out++ = 'u';
            shift = 12;
        }
        else {
            /* U+010000-U+10ffff: '\U00HHHHHH'; the two leading digits are
               always zero, six more follow. */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            shift = 20;
        }
        /* Emit the remaining hex digits, most significant nibble first. */
        for (; shift >= 0; shift -= 4) {
            *out++ = Py_hexdigits[(ch >> shift) & 0xf];
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7066
7067
/* --- Latin-1 Codec ------------------------------------------------------ */
7068
7069
PyObject *
7070
PyUnicode_DecodeLatin1(const char *s,
7071
                       Py_ssize_t size,
7072
                       const char *errors)
7073
3.05M
{
7074
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7075
3.05M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7076
3.05M
}
7077
7078
/* create or adjust a UnicodeEncodeError */
7079
static void
7080
make_encode_exception(PyObject **exceptionObject,
7081
                      const char *encoding,
7082
                      PyObject *unicode,
7083
                      Py_ssize_t startpos, Py_ssize_t endpos,
7084
                      const char *reason)
7085
244k
{
7086
244k
    if (*exceptionObject == NULL) {
7087
244k
        *exceptionObject = PyObject_CallFunction(
7088
244k
            PyExc_UnicodeEncodeError, "sOnns",
7089
244k
            encoding, unicode, startpos, endpos, reason);
7090
244k
    }
7091
0
    else {
7092
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7093
0
            goto onError;
7094
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7095
0
            goto onError;
7096
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7097
0
            goto onError;
7098
0
        return;
7099
0
      onError:
7100
0
        Py_CLEAR(*exceptionObject);
7101
0
    }
7102
244k
}
7103
7104
/* raises a UnicodeEncodeError */
7105
static void
7106
raise_encode_exception(PyObject **exceptionObject,
7107
                       const char *encoding,
7108
                       PyObject *unicode,
7109
                       Py_ssize_t startpos, Py_ssize_t endpos,
7110
                       const char *reason)
7111
85.2k
{
7112
85.2k
    make_encode_exception(exceptionObject,
7113
85.2k
                          encoding, unicode, startpos, endpos, reason);
7114
85.2k
    if (*exceptionObject != NULL)
7115
85.2k
        PyCodec_StrictErrors(*exceptionObject);
7116
85.2k
}
7117
7118
/* error handling callback helper:
7119
   build arguments, call the callback and check the arguments,
7120
   put the result into newpos and return the replacement string, which
7121
   has to be freed by the caller */
7122
static PyObject *
7123
unicode_encode_call_errorhandler(const char *errors,
7124
                                 PyObject **errorHandler,
7125
                                 const char *encoding, const char *reason,
7126
                                 PyObject *unicode, PyObject **exceptionObject,
7127
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7128
                                 Py_ssize_t *newpos)
7129
159k
{
7130
159k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7131
159k
    Py_ssize_t len;
7132
159k
    PyObject *restuple;
7133
159k
    PyObject *resunicode;
7134
7135
159k
    if (*errorHandler == NULL) {
7136
159k
        *errorHandler = PyCodec_LookupError(errors);
7137
159k
        if (*errorHandler == NULL)
7138
0
            return NULL;
7139
159k
    }
7140
7141
159k
    len = PyUnicode_GET_LENGTH(unicode);
7142
7143
159k
    make_encode_exception(exceptionObject,
7144
159k
                          encoding, unicode, startpos, endpos, reason);
7145
159k
    if (*exceptionObject == NULL)
7146
0
        return NULL;
7147
7148
159k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7149
159k
    if (restuple == NULL)
7150
159k
        return NULL;
7151
0
    if (!PyTuple_Check(restuple)) {
7152
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7153
0
        Py_DECREF(restuple);
7154
0
        return NULL;
7155
0
    }
7156
0
    if (!PyArg_ParseTuple(restuple, argparse,
7157
0
                          &resunicode, newpos)) {
7158
0
        Py_DECREF(restuple);
7159
0
        return NULL;
7160
0
    }
7161
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7162
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7163
0
        Py_DECREF(restuple);
7164
0
        return NULL;
7165
0
    }
7166
0
    if (*newpos<0)
7167
0
        *newpos = len + *newpos;
7168
0
    if (*newpos<0 || *newpos>len) {
7169
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7170
0
        Py_DECREF(restuple);
7171
0
        return NULL;
7172
0
    }
7173
0
    Py_INCREF(resunicode);
7174
0
    Py_DECREF(restuple);
7175
0
    return resunicode;
7176
0
}
7177
7178
static PyObject *
7179
unicode_encode_ucs1(PyObject *unicode,
7180
                    const char *errors,
7181
                    const Py_UCS4 limit)
7182
99.7k
{
7183
    /* input state */
7184
99.7k
    Py_ssize_t pos=0, size;
7185
99.7k
    int kind;
7186
99.7k
    const void *data;
7187
99.7k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7188
99.7k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7189
99.7k
    PyObject *error_handler_obj = NULL;
7190
99.7k
    PyObject *exc = NULL;
7191
99.7k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7192
99.7k
    PyObject *rep = NULL;
7193
7194
99.7k
    size = PyUnicode_GET_LENGTH(unicode);
7195
99.7k
    kind = PyUnicode_KIND(unicode);
7196
99.7k
    data = PyUnicode_DATA(unicode);
7197
    /* allocate enough for a simple encoding without
7198
       replacements, if we need more, we'll resize */
7199
99.7k
    if (size == 0)
7200
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7201
7202
    /* output object */
7203
99.7k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7204
99.7k
    if (writer == NULL) {
7205
0
        return NULL;
7206
0
    }
7207
    /* pointer into the output */
7208
99.7k
    char *str = PyBytesWriter_GetData(writer);
7209
7210
5.03M
    while (pos < size) {
7211
5.03M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7212
7213
        /* can we encode this? */
7214
5.03M
        if (ch < limit) {
7215
            /* no overflow check, because we know that the space is enough */
7216
4.93M
            *str++ = (char)ch;
7217
4.93M
            ++pos;
7218
4.93M
        }
7219
99.8k
        else {
7220
99.8k
            Py_ssize_t newpos, i;
7221
            /* startpos for collecting unencodable chars */
7222
99.8k
            Py_ssize_t collstart = pos;
7223
99.8k
            Py_ssize_t collend = collstart + 1;
7224
            /* find all unecodable characters */
7225
7226
958k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7227
858k
                ++collend;
7228
7229
            /* Only overallocate the buffer if it's not the last write */
7230
99.8k
            writer->overallocate = (collend < size);
7231
7232
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7233
99.8k
            if (error_handler == _Py_ERROR_UNKNOWN)
7234
99.7k
                error_handler = _Py_GetErrorHandler(errors);
7235
7236
99.8k
            switch (error_handler) {
7237
85.2k
            case _Py_ERROR_STRICT:
7238
85.2k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7239
85.2k
                goto onError;
7240
7241
4.39k
            case _Py_ERROR_REPLACE:
7242
4.39k
                memset(str, '?', collend - collstart);
7243
4.39k
                str += (collend - collstart);
7244
4.39k
                _Py_FALLTHROUGH;
7245
4.39k
            case _Py_ERROR_IGNORE:
7246
4.39k
                pos = collend;
7247
4.39k
                break;
7248
7249
0
            case _Py_ERROR_BACKSLASHREPLACE:
7250
                /* subtract preallocated bytes */
7251
0
                writer->size -= (collend - collstart);
7252
0
                str = backslashreplace(writer, str,
7253
0
                                       unicode, collstart, collend);
7254
0
                if (str == NULL)
7255
0
                    goto onError;
7256
0
                pos = collend;
7257
0
                break;
7258
7259
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7260
                /* subtract preallocated bytes */
7261
0
                writer->size -= (collend - collstart);
7262
0
                str = xmlcharrefreplace(writer, str,
7263
0
                                        unicode, collstart, collend);
7264
0
                if (str == NULL)
7265
0
                    goto onError;
7266
0
                pos = collend;
7267
0
                break;
7268
7269
10.1k
            case _Py_ERROR_SURROGATEESCAPE:
7270
10.1k
                for (i = collstart; i < collend; ++i) {
7271
10.1k
                    ch = PyUnicode_READ(kind, data, i);
7272
10.1k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7273
                        /* Not a UTF-8b surrogate */
7274
10.1k
                        break;
7275
10.1k
                    }
7276
0
                    *str++ = (char)(ch - 0xdc00);
7277
0
                    ++pos;
7278
0
                }
7279
10.1k
                if (i >= collend)
7280
0
                    break;
7281
10.1k
                collstart = pos;
7282
10.1k
                assert(collstart != collend);
7283
10.1k
                _Py_FALLTHROUGH;
7284
7285
10.1k
            default:
7286
10.1k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7287
10.1k
                                                       encoding, reason, unicode, &exc,
7288
10.1k
                                                       collstart, collend, &newpos);
7289
10.1k
                if (rep == NULL)
7290
10.1k
                    goto onError;
7291
7292
0
                if (newpos < collstart) {
7293
0
                    writer->overallocate = 1;
7294
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7295
0
                                                             collstart - newpos,
7296
0
                                                             str);
7297
0
                    if (str == NULL) {
7298
0
                        goto onError;
7299
0
                    }
7300
0
                }
7301
0
                else {
7302
                    /* subtract preallocated bytes */
7303
0
                    writer->size -= newpos - collstart;
7304
                    /* Only overallocate the buffer if it's not the last write */
7305
0
                    writer->overallocate = (newpos < size);
7306
0
                }
7307
7308
0
                char *rep_str;
7309
0
                Py_ssize_t rep_len;
7310
0
                if (PyBytes_Check(rep)) {
7311
                    /* Directly copy bytes result to output. */
7312
0
                    rep_str = PyBytes_AS_STRING(rep);
7313
0
                    rep_len = PyBytes_GET_SIZE(rep);
7314
0
                }
7315
0
                else {
7316
0
                    assert(PyUnicode_Check(rep));
7317
7318
0
                    if (limit == 256 ?
7319
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7320
0
                        !PyUnicode_IS_ASCII(rep))
7321
0
                    {
7322
                        /* Not all characters are smaller than limit */
7323
0
                        raise_encode_exception(&exc, encoding, unicode,
7324
0
                                               collstart, collend, reason);
7325
0
                        goto onError;
7326
0
                    }
7327
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7328
0
                    rep_str = PyUnicode_DATA(rep);
7329
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7330
0
                }
7331
7332
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7333
0
                if (str == NULL) {
7334
0
                    goto onError;
7335
0
                }
7336
0
                memcpy(str, rep_str, rep_len);
7337
0
                str += rep_len;
7338
7339
0
                pos = newpos;
7340
0
                Py_CLEAR(rep);
7341
99.8k
            }
7342
7343
            /* If overallocation was disabled, ensure that it was the last
7344
               write. Otherwise, we missed an optimization */
7345
99.8k
            assert(writer->overallocate || pos == size);
7346
4.39k
        }
7347
5.03M
    }
7348
7349
4.37k
    Py_XDECREF(error_handler_obj);
7350
4.37k
    Py_XDECREF(exc);
7351
4.37k
    return PyBytesWriter_FinishWithPointer(writer, str);
7352
7353
95.4k
  onError:
7354
95.4k
    Py_XDECREF(rep);
7355
95.4k
    PyBytesWriter_Discard(writer);
7356
95.4k
    Py_XDECREF(error_handler_obj);
7357
95.4k
    Py_XDECREF(exc);
7358
95.4k
    return NULL;
7359
99.7k
}
7360
7361
/* Encode a str object to Latin-1 bytes using the error handler `errors`.
   Returns a new bytes object, or NULL with an exception set.  A non-str
   argument is rejected with PyErr_BadArgument(). */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Characters outside Latin-1 are present: the general encoder
           applies the error handler to them. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string is already Latin-1, so the bytes object
       is built directly from its storage. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7377
7378
/* Encode a str object to Latin-1 bytes, passing NULL as the error handler
   name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7383
7384
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7385
7386
PyObject *
7387
PyUnicode_DecodeASCII(const char *s,
7388
                      Py_ssize_t size,
7389
                      const char *errors)
7390
694k
{
7391
694k
    const char *starts = s;
7392
694k
    const char *e = s + size;
7393
694k
    PyObject *error_handler_obj = NULL;
7394
694k
    PyObject *exc = NULL;
7395
694k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7396
7397
694k
    if (size == 0)
7398
0
        _Py_RETURN_UNICODE_EMPTY();
7399
7400
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7401
694k
    if (size == 1 && (unsigned char)s[0] < 128) {
7402
6.41k
        return get_latin1_char((unsigned char)s[0]);
7403
6.41k
    }
7404
7405
    // Shortcut for simple case
7406
688k
    PyObject *u = PyUnicode_New(size, 127);
7407
688k
    if (u == NULL) {
7408
0
        return NULL;
7409
0
    }
7410
688k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7411
688k
    if (outpos == size) {
7412
490k
        return u;
7413
490k
    }
7414
7415
197k
    _PyUnicodeWriter writer;
7416
197k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7417
197k
    writer.pos = outpos;
7418
7419
197k
    s += outpos;
7420
197k
    int kind = writer.kind;
7421
197k
    void *data = writer.data;
7422
197k
    Py_ssize_t startinpos, endinpos;
7423
7424
28.3M
    while (s < e) {
7425
28.1M
        unsigned char c = (unsigned char)*s;
7426
28.1M
        if (c < 128) {
7427
16.5M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7428
16.5M
            writer.pos++;
7429
16.5M
            ++s;
7430
16.5M
            continue;
7431
16.5M
        }
7432
7433
        /* byte outsize range 0x00..0x7f: call the error handler */
7434
7435
11.6M
        if (error_handler == _Py_ERROR_UNKNOWN)
7436
197k
            error_handler = _Py_GetErrorHandler(errors);
7437
7438
11.6M
        switch (error_handler)
7439
11.6M
        {
7440
1.00M
        case _Py_ERROR_REPLACE:
7441
11.6M
        case _Py_ERROR_SURROGATEESCAPE:
7442
            /* Fast-path: the error handler only writes one character,
7443
               but we may switch to UCS2 at the first write */
7444
11.6M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7445
0
                goto onError;
7446
11.6M
            kind = writer.kind;
7447
11.6M
            data = writer.data;
7448
7449
11.6M
            if (error_handler == _Py_ERROR_REPLACE)
7450
1.00M
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7451
10.5M
            else
7452
10.5M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7453
11.6M
            writer.pos++;
7454
11.6M
            ++s;
7455
11.6M
            break;
7456
7457
0
        case _Py_ERROR_IGNORE:
7458
0
            ++s;
7459
0
            break;
7460
7461
8.00k
        default:
7462
8.00k
            startinpos = s-starts;
7463
8.00k
            endinpos = startinpos + 1;
7464
8.00k
            if (unicode_decode_call_errorhandler_writer(
7465
8.00k
                    errors, &error_handler_obj,
7466
8.00k
                    "ascii", "ordinal not in range(128)",
7467
8.00k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7468
8.00k
                    &writer))
7469
8.00k
                goto onError;
7470
0
            kind = writer.kind;
7471
0
            data = writer.data;
7472
11.6M
        }
7473
11.6M
    }
7474
189k
    Py_XDECREF(error_handler_obj);
7475
189k
    Py_XDECREF(exc);
7476
189k
    return _PyUnicodeWriter_Finish(&writer);
7477
7478
8.00k
  onError:
7479
8.00k
    _PyUnicodeWriter_Dealloc(&writer);
7480
8.00k
    Py_XDECREF(error_handler_obj);
7481
8.00k
    Py_XDECREF(exc);
7482
8.00k
    return NULL;
7483
197k
}
7484
7485
/* Encode a str object to ASCII bytes using the error handler `errors`.
   Returns a new bytes object, or NULL with an exception set.  A non-str
   argument is rejected with PyErr_BadArgument(). */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the general encoder applies
           the error handler to them. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string is its own encoding, so the bytes
       object is built directly from its storage. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7499
7500
/* Encode a str object to ASCII bytes, passing NULL as the error handler
   name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7505
7506
#ifdef MS_WINDOWS
7507
7508
/* --- MBCS codecs for Windows -------------------------------------------- */
7509
7510
#if SIZEOF_INT < SIZEOF_SIZE_T
7511
#define NEED_RETRY
7512
#endif
7513
7514
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7515
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7516
   both cases and also avoids partial characters overrunning the
7517
   length limit in MultiByteToWideChar on Windows */
7518
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7519
7520
#ifndef WC_ERR_INVALID_CHARS
7521
#  define WC_ERR_INVALID_CHARS 0x0080
7522
#endif
7523
7524
static const char*
7525
code_page_name(UINT code_page, PyObject **obj)
7526
{
7527
    *obj = NULL;
7528
    if (code_page == CP_ACP)
7529
        return "mbcs";
7530
7531
    *obj = PyBytes_FromFormat("cp%u", code_page);
7532
    if (*obj == NULL)
7533
        return NULL;
7534
    return PyBytes_AS_STRING(*obj);
7535
}
7536
7537
static DWORD
7538
decode_code_page_flags(UINT code_page)
7539
{
7540
    if (code_page == CP_UTF7) {
7541
        /* The CP_UTF7 decoder only supports flags=0 */
7542
        return 0;
7543
    }
7544
    else
7545
        return MB_ERR_INVALID_CHARS;
7546
}
7547
7548
/*
7549
 * Decode a byte string from a Windows code page into unicode object in strict
7550
 * mode.
7551
 *
7552
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7553
 * OSError and returns -1 on other error.
7554
 */
7555
static int
7556
decode_code_page_strict(UINT code_page,
7557
                        wchar_t **buf,
7558
                        Py_ssize_t *bufsize,
7559
                        const char *in,
7560
                        int insize)
7561
{
7562
    DWORD flags = MB_ERR_INVALID_CHARS;
7563
    wchar_t *out;
7564
    DWORD outsize;
7565
7566
    /* First get the size of the result */
7567
    assert(insize > 0);
7568
    while ((outsize = MultiByteToWideChar(code_page, flags,
7569
                                          in, insize, NULL, 0)) <= 0)
7570
    {
7571
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7572
            goto error;
7573
        }
7574
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7575
        flags = 0;
7576
    }
7577
7578
    /* Extend a wchar_t* buffer */
7579
    Py_ssize_t n = *bufsize;   /* Get the current length */
7580
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7581
        return -1;
7582
    }
7583
    out = *buf + n;
7584
7585
    /* Do the conversion */
7586
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7587
    if (outsize <= 0)
7588
        goto error;
7589
    return insize;
7590
7591
error:
7592
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7593
        return -2;
7594
    PyErr_SetFromWindowsErr(0);
7595
    return -1;
7596
}
7597
7598
/*
7599
 * Decode a byte string from a code page into unicode object with an error
7600
 * handler.
7601
 *
7602
 * Returns consumed size if succeed, or raise an OSError or
7603
 * UnicodeDecodeError exception and returns -1 on error.
7604
 */
7605
static int
7606
decode_code_page_errors(UINT code_page,
7607
                        wchar_t **buf,
7608
                        Py_ssize_t *bufsize,
7609
                        const char *in, const int size,
7610
                        const char *errors, int final)
7611
{
7612
    const char *startin = in;
7613
    const char *endin = in + size;
7614
    DWORD flags = MB_ERR_INVALID_CHARS;
7615
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7616
       2000 English version of the message. */
7617
    const char *reason = "No mapping for the Unicode character exists "
7618
                         "in the target code page.";
7619
    /* each step cannot decode more than 1 character, but a character can be
7620
       represented as a surrogate pair */
7621
    wchar_t buffer[2], *out;
7622
    int insize;
7623
    Py_ssize_t outsize;
7624
    PyObject *errorHandler = NULL;
7625
    PyObject *exc = NULL;
7626
    PyObject *encoding_obj = NULL;
7627
    const char *encoding;
7628
    DWORD err;
7629
    int ret = -1;
7630
7631
    assert(size > 0);
7632
7633
    encoding = code_page_name(code_page, &encoding_obj);
7634
    if (encoding == NULL)
7635
        return -1;
7636
7637
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7638
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7639
           UnicodeDecodeError. */
7640
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7641
        if (exc != NULL) {
7642
            PyCodec_StrictErrors(exc);
7643
            Py_CLEAR(exc);
7644
        }
7645
        goto error;
7646
    }
7647
7648
    /* Extend a wchar_t* buffer */
7649
    Py_ssize_t n = *bufsize;   /* Get the current length */
7650
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7651
        PyErr_NoMemory();
7652
        goto error;
7653
    }
7654
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7655
        goto error;
7656
    }
7657
    out = *buf + n;
7658
7659
    /* Decode the byte string character per character */
7660
    while (in < endin)
7661
    {
7662
        /* Decode a character */
7663
        insize = 1;
7664
        do
7665
        {
7666
            outsize = MultiByteToWideChar(code_page, flags,
7667
                                          in, insize,
7668
                                          buffer, Py_ARRAY_LENGTH(buffer));
7669
            if (outsize > 0)
7670
                break;
7671
            err = GetLastError();
7672
            if (err == ERROR_INVALID_FLAGS && flags) {
7673
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7674
                flags = 0;
7675
                continue;
7676
            }
7677
            if (err != ERROR_NO_UNICODE_TRANSLATION
7678
                && err != ERROR_INSUFFICIENT_BUFFER)
7679
            {
7680
                PyErr_SetFromWindowsErr(err);
7681
                goto error;
7682
            }
7683
            insize++;
7684
        }
7685
        /* 4=maximum length of a UTF-8 sequence */
7686
        while (insize <= 4 && (in + insize) <= endin);
7687
7688
        if (outsize <= 0) {
7689
            Py_ssize_t startinpos, endinpos, outpos;
7690
7691
            /* last character in partial decode? */
7692
            if (in + insize >= endin && !final)
7693
                break;
7694
7695
            startinpos = in - startin;
7696
            endinpos = startinpos + 1;
7697
            outpos = out - *buf;
7698
            if (unicode_decode_call_errorhandler_wchar(
7699
                    errors, &errorHandler,
7700
                    encoding, reason,
7701
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7702
                    buf, bufsize, &outpos))
7703
            {
7704
                goto error;
7705
            }
7706
            out = *buf + outpos;
7707
        }
7708
        else {
7709
            in += insize;
7710
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7711
            out += outsize;
7712
        }
7713
    }
7714
7715
    /* Shrink the buffer */
7716
    assert(out - *buf <= *bufsize);
7717
    *bufsize = out - *buf;
7718
    /* (in - startin) <= size and size is an int */
7719
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7720
7721
error:
7722
    Py_XDECREF(encoding_obj);
7723
    Py_XDECREF(errorHandler);
7724
    Py_XDECREF(exc);
7725
    return ret;
7726
}
7727
7728
static PyObject *
7729
decode_code_page_stateful(int code_page,
7730
                          const char *s, Py_ssize_t size,
7731
                          const char *errors, Py_ssize_t *consumed)
7732
{
7733
    wchar_t *buf = NULL;
7734
    Py_ssize_t bufsize = 0;
7735
    int chunk_size, final, converted, done;
7736
7737
    if (code_page < 0) {
7738
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7739
        return NULL;
7740
    }
7741
    if (size < 0) {
7742
        PyErr_BadInternalCall();
7743
        return NULL;
7744
    }
7745
7746
    if (consumed)
7747
        *consumed = 0;
7748
7749
    do
7750
    {
7751
#ifdef NEED_RETRY
7752
        if (size > DECODING_CHUNK_SIZE) {
7753
            chunk_size = DECODING_CHUNK_SIZE;
7754
            final = 0;
7755
            done = 0;
7756
        }
7757
        else
7758
#endif
7759
        {
7760
            chunk_size = (int)size;
7761
            final = (consumed == NULL);
7762
            done = 1;
7763
        }
7764
7765
        if (chunk_size == 0 && done) {
7766
            if (buf != NULL)
7767
                break;
7768
            _Py_RETURN_UNICODE_EMPTY();
7769
        }
7770
7771
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7772
                                            s, chunk_size);
7773
        if (converted == -2)
7774
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7775
                                                s, chunk_size,
7776
                                                errors, final);
7777
        assert(converted != 0 || done);
7778
7779
        if (converted < 0) {
7780
            PyMem_Free(buf);
7781
            return NULL;
7782
        }
7783
7784
        if (consumed)
7785
            *consumed += converted;
7786
7787
        s += converted;
7788
        size -= converted;
7789
    } while (!done);
7790
7791
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7792
    PyMem_Free(buf);
7793
    return v;
7794
}
7795
7796
PyObject *
7797
PyUnicode_DecodeCodePageStateful(int code_page,
7798
                                 const char *s,
7799
                                 Py_ssize_t size,
7800
                                 const char *errors,
7801
                                 Py_ssize_t *consumed)
7802
{
7803
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7804
}
7805
7806
PyObject *
7807
PyUnicode_DecodeMBCSStateful(const char *s,
7808
                             Py_ssize_t size,
7809
                             const char *errors,
7810
                             Py_ssize_t *consumed)
7811
{
7812
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7813
}
7814
7815
PyObject *
7816
PyUnicode_DecodeMBCS(const char *s,
7817
                     Py_ssize_t size,
7818
                     const char *errors)
7819
{
7820
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7821
}
7822
7823
static DWORD
7824
encode_code_page_flags(UINT code_page, const char *errors)
7825
{
7826
    if (code_page == CP_UTF8) {
7827
        return WC_ERR_INVALID_CHARS;
7828
    }
7829
    else if (code_page == CP_UTF7) {
7830
        /* CP_UTF7 only supports flags=0 */
7831
        return 0;
7832
    }
7833
    else {
7834
        if (errors != NULL && strcmp(errors, "replace") == 0)
7835
            return 0;
7836
        else
7837
            return WC_NO_BEST_FIT_CHARS;
7838
    }
7839
}
7840
7841
/*
7842
 * Encode a Unicode string to a Windows code page into a byte string in strict
7843
 * mode.
7844
 *
7845
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7846
 * an OSError and returns -1 on other error.
7847
 */
7848
static int
7849
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7850
                        PyObject *unicode, Py_ssize_t offset, int len,
7851
                        const char* errors)
7852
{
7853
    BOOL usedDefaultChar = FALSE;
7854
    BOOL *pusedDefaultChar = &usedDefaultChar;
7855
    int outsize;
7856
    wchar_t *p;
7857
    Py_ssize_t size;
7858
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7859
    char *out;
7860
    /* Create a substring so that we can get the UTF-16 representation
7861
       of just the slice under consideration. */
7862
    PyObject *substring;
7863
    int ret = -1;
7864
7865
    assert(len > 0);
7866
7867
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7868
        pusedDefaultChar = &usedDefaultChar;
7869
    else
7870
        pusedDefaultChar = NULL;
7871
7872
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7873
    if (substring == NULL)
7874
        return -1;
7875
    p = PyUnicode_AsWideCharString(substring, &size);
7876
    Py_CLEAR(substring);
7877
    if (p == NULL) {
7878
        return -1;
7879
    }
7880
    assert(size <= INT_MAX);
7881
7882
    /* First get the size of the result */
7883
    outsize = WideCharToMultiByte(code_page, flags,
7884
                                  p, (int)size,
7885
                                  NULL, 0,
7886
                                  NULL, pusedDefaultChar);
7887
    if (outsize <= 0)
7888
        goto error;
7889
    /* If we used a default char, then we failed! */
7890
    if (pusedDefaultChar && *pusedDefaultChar) {
7891
        ret = -2;
7892
        goto done;
7893
    }
7894
7895
    if (*writer == NULL) {
7896
        /* Create string object */
7897
        *writer = PyBytesWriter_Create(outsize);
7898
        if (*writer == NULL) {
7899
            goto done;
7900
        }
7901
        out = PyBytesWriter_GetData(*writer);
7902
    }
7903
    else {
7904
        /* Extend string object */
7905
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7906
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7907
            goto done;
7908
        }
7909
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7910
    }
7911
7912
    /* Do the conversion */
7913
    outsize = WideCharToMultiByte(code_page, flags,
7914
                                  p, (int)size,
7915
                                  out, outsize,
7916
                                  NULL, pusedDefaultChar);
7917
    if (outsize <= 0)
7918
        goto error;
7919
    if (pusedDefaultChar && *pusedDefaultChar) {
7920
        ret = -2;
7921
        goto done;
7922
    }
7923
    ret = 0;
7924
7925
done:
7926
    PyMem_Free(p);
7927
    return ret;
7928
7929
error:
7930
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7931
        ret = -2;
7932
        goto done;
7933
    }
7934
    PyErr_SetFromWindowsErr(0);
7935
    goto done;
7936
}
7937
7938
/*
7939
 * Encode a Unicode string to a Windows code page into a byte string using an
7940
 * error handler.
7941
 *
7942
 * Returns consumed characters if succeed, or raise an OSError and returns
7943
 * -1 on other error.
7944
 */
7945
static int
7946
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7947
                        PyObject *unicode, Py_ssize_t unicode_offset,
7948
                        Py_ssize_t insize, const char* errors)
7949
{
7950
    const DWORD flags = encode_code_page_flags(code_page, errors);
7951
    Py_ssize_t pos = unicode_offset;
7952
    Py_ssize_t endin = unicode_offset + insize;
7953
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7954
       2000 English version of the message. */
7955
    const char *reason = "invalid character";
7956
    /* 4=maximum length of a UTF-8 sequence */
7957
    char buffer[4];
7958
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7959
    Py_ssize_t outsize;
7960
    char *out;
7961
    PyObject *errorHandler = NULL;
7962
    PyObject *exc = NULL;
7963
    PyObject *encoding_obj = NULL;
7964
    const char *encoding;
7965
    Py_ssize_t newpos;
7966
    PyObject *rep;
7967
    int ret = -1;
7968
7969
    assert(insize > 0);
7970
7971
    encoding = code_page_name(code_page, &encoding_obj);
7972
    if (encoding == NULL)
7973
        return -1;
7974
7975
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7976
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7977
           then we raise a UnicodeEncodeError. */
7978
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7979
        if (exc != NULL) {
7980
            PyCodec_StrictErrors(exc);
7981
            Py_DECREF(exc);
7982
        }
7983
        Py_XDECREF(encoding_obj);
7984
        return -1;
7985
    }
7986
7987
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7988
        pusedDefaultChar = &usedDefaultChar;
7989
    else
7990
        pusedDefaultChar = NULL;
7991
7992
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7993
        PyErr_NoMemory();
7994
        goto error;
7995
    }
7996
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7997
7998
    if (*writer == NULL) {
7999
        /* Create string object */
8000
        *writer = PyBytesWriter_Create(outsize);
8001
        if (*writer == NULL) {
8002
            goto error;
8003
        }
8004
        out = PyBytesWriter_GetData(*writer);
8005
    }
8006
    else {
8007
        /* Extend string object */
8008
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8009
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8010
            goto error;
8011
        }
8012
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8013
    }
8014
8015
    /* Encode the string character per character */
8016
    while (pos < endin)
8017
    {
8018
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8019
        wchar_t chars[2];
8020
        int charsize;
8021
        if (ch < 0x10000) {
8022
            chars[0] = (wchar_t)ch;
8023
            charsize = 1;
8024
        }
8025
        else {
8026
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8027
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8028
            charsize = 2;
8029
        }
8030
8031
        outsize = WideCharToMultiByte(code_page, flags,
8032
                                      chars, charsize,
8033
                                      buffer, Py_ARRAY_LENGTH(buffer),
8034
                                      NULL, pusedDefaultChar);
8035
        if (outsize > 0) {
8036
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8037
            {
8038
                pos++;
8039
                memcpy(out, buffer, outsize);
8040
                out += outsize;
8041
                continue;
8042
            }
8043
        }
8044
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8045
            PyErr_SetFromWindowsErr(0);
8046
            goto error;
8047
        }
8048
8049
        rep = unicode_encode_call_errorhandler(
8050
                  errors, &errorHandler, encoding, reason,
8051
                  unicode, &exc,
8052
                  pos, pos + 1, &newpos);
8053
        if (rep == NULL)
8054
            goto error;
8055
8056
        Py_ssize_t morebytes = pos - newpos;
8057
        if (PyBytes_Check(rep)) {
8058
            outsize = PyBytes_GET_SIZE(rep);
8059
            morebytes += outsize;
8060
            if (morebytes > 0) {
8061
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8062
                if (out == NULL) {
8063
                    Py_DECREF(rep);
8064
                    goto error;
8065
                }
8066
            }
8067
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8068
            out += outsize;
8069
        }
8070
        else {
8071
            Py_ssize_t i;
8072
            int kind;
8073
            const void *data;
8074
8075
            outsize = PyUnicode_GET_LENGTH(rep);
8076
            morebytes += outsize;
8077
            if (morebytes > 0) {
8078
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8079
                if (out == NULL) {
8080
                    Py_DECREF(rep);
8081
                    goto error;
8082
                }
8083
            }
8084
            kind = PyUnicode_KIND(rep);
8085
            data = PyUnicode_DATA(rep);
8086
            for (i=0; i < outsize; i++) {
8087
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8088
                if (ch > 127) {
8089
                    raise_encode_exception(&exc,
8090
                        encoding, unicode,
8091
                        pos, pos + 1,
8092
                        "unable to encode error handler result to ASCII");
8093
                    Py_DECREF(rep);
8094
                    goto error;
8095
                }
8096
                *out = (unsigned char)ch;
8097
                out++;
8098
            }
8099
        }
8100
        pos = newpos;
8101
        Py_DECREF(rep);
8102
    }
8103
    /* write a NUL byte */
8104
    *out = 0;
8105
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8106
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8107
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8108
        goto error;
8109
    }
8110
    ret = 0;
8111
8112
error:
8113
    Py_XDECREF(encoding_obj);
8114
    Py_XDECREF(errorHandler);
8115
    Py_XDECREF(exc);
8116
    return ret;
8117
}
8118
8119
8120
/* Encode the str object `unicode` to bytes using the Windows code page
   `code_page`.

   The string is processed in chunks.  Each chunk is first encoded with
   encode_code_page_strict(); if that reports -2 the chunk is encoded again
   by encode_code_page_errors() using the `errors` handler.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    Py_ssize_t start = 0;
    for (;;) {
        int chunk, last_chunk, status;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            /* More characters remain after this chunk. */
            chunk = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk = (int)remaining;
            last_chunk = 1;
        }

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, start, chunk,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, start,
                                             chunk, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        start += chunk;
        remaining -= chunk;

        if (last_chunk) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8178
8179
8180
/* Encode `unicode` with PyUnicode_EncodeCodePage() using the code page
   CP_ACP and errors=NULL. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *default_errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, default_errors);
}
8185
8186
#undef NEED_RETRY
8187
8188
#endif /* MS_WINDOWS */
8189
8190
/* --- Character Mapping Codec -------------------------------------------- */
8191
8192
static int
8193
charmap_decode_string(const char *s,
8194
                      Py_ssize_t size,
8195
                      PyObject *mapping,
8196
                      const char *errors,
8197
                      _PyUnicodeWriter *writer)
8198
636k
{
8199
636k
    const char *starts = s;
8200
636k
    const char *e;
8201
636k
    Py_ssize_t startinpos, endinpos;
8202
636k
    PyObject *errorHandler = NULL, *exc = NULL;
8203
636k
    Py_ssize_t maplen;
8204
636k
    int mapkind;
8205
636k
    const void *mapdata;
8206
636k
    Py_UCS4 x;
8207
636k
    unsigned char ch;
8208
8209
636k
    maplen = PyUnicode_GET_LENGTH(mapping);
8210
636k
    mapdata = PyUnicode_DATA(mapping);
8211
636k
    mapkind = PyUnicode_KIND(mapping);
8212
8213
636k
    e = s + size;
8214
8215
636k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8216
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8217
         * is disabled in encoding aliases, latin1 is preferred because
8218
         * its implementation is faster. */
8219
127
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8220
127
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8221
127
        Py_UCS4 maxchar = writer->maxchar;
8222
8223
127
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8224
10.1k
        while (s < e) {
8225
10.0k
            ch = *s;
8226
10.0k
            x = mapdata_ucs1[ch];
8227
10.0k
            if (x > maxchar) {
8228
116
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8229
0
                    goto onError;
8230
116
                maxchar = writer->maxchar;
8231
116
                outdata = (Py_UCS1 *)writer->data;
8232
116
            }
8233
10.0k
            outdata[writer->pos] = x;
8234
10.0k
            writer->pos++;
8235
10.0k
            ++s;
8236
10.0k
        }
8237
127
        return 0;
8238
127
    }
8239
8240
756k
    while (s < e) {
8241
739k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8242
739k
            int outkind = writer->kind;
8243
739k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8244
739k
            if (outkind == PyUnicode_1BYTE_KIND) {
8245
674k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8246
674k
                Py_UCS4 maxchar = writer->maxchar;
8247
12.7M
                while (s < e) {
8248
12.1M
                    ch = *s;
8249
12.1M
                    x = mapdata_ucs2[ch];
8250
12.1M
                    if (x > maxchar)
8251
81.1k
                        goto Error;
8252
12.1M
                    outdata[writer->pos] = x;
8253
12.1M
                    writer->pos++;
8254
12.1M
                    ++s;
8255
12.1M
                }
8256
593k
                break;
8257
674k
            }
8258
65.1k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8259
65.1k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8260
17.3M
                while (s < e) {
8261
17.3M
                    ch = *s;
8262
17.3M
                    x = mapdata_ucs2[ch];
8263
17.3M
                    if (x == 0xFFFE)
8264
39.4k
                        goto Error;
8265
17.3M
                    outdata[writer->pos] = x;
8266
17.3M
                    writer->pos++;
8267
17.3M
                    ++s;
8268
17.3M
                }
8269
25.6k
                break;
8270
65.1k
            }
8271
739k
        }
8272
0
        ch = *s;
8273
8274
0
        if (ch < maplen)
8275
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8276
0
        else
8277
0
            x = 0xfffe; /* invalid value */
8278
120k
Error:
8279
120k
        if (x == 0xfffe)
8280
63.0k
        {
8281
            /* undefined mapping */
8282
63.0k
            startinpos = s-starts;
8283
63.0k
            endinpos = startinpos+1;
8284
63.0k
            if (unicode_decode_call_errorhandler_writer(
8285
63.0k
                    errors, &errorHandler,
8286
63.0k
                    "charmap", "character maps to <undefined>",
8287
63.0k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8288
63.0k
                    writer)) {
8289
17
                goto onError;
8290
17
            }
8291
63.0k
            continue;
8292
63.0k
        }
8293
8294
57.5k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8295
0
            goto onError;
8296
57.5k
        ++s;
8297
57.5k
    }
8298
635k
    Py_XDECREF(errorHandler);
8299
635k
    Py_XDECREF(exc);
8300
635k
    return 0;
8301
8302
17
onError:
8303
17
    Py_XDECREF(errorHandler);
8304
17
    Py_XDECREF(exc);
8305
17
    return -1;
8306
635k
}
8307
8308
static int
8309
charmap_decode_mapping(const char *s,
8310
                       Py_ssize_t size,
8311
                       PyObject *mapping,
8312
                       const char *errors,
8313
                       _PyUnicodeWriter *writer)
8314
0
{
8315
0
    const char *starts = s;
8316
0
    const char *e;
8317
0
    Py_ssize_t startinpos, endinpos;
8318
0
    PyObject *errorHandler = NULL, *exc = NULL;
8319
0
    unsigned char ch;
8320
0
    PyObject *key, *item = NULL;
8321
8322
0
    e = s + size;
8323
8324
0
    while (s < e) {
8325
0
        ch = *s;
8326
8327
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8328
0
        key = PyLong_FromLong((long)ch);
8329
0
        if (key == NULL)
8330
0
            goto onError;
8331
8332
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8333
0
        Py_DECREF(key);
8334
0
        if (rc == 0) {
8335
            /* No mapping found means: mapping is undefined. */
8336
0
            goto Undefined;
8337
0
        }
8338
0
        if (item == NULL) {
8339
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8340
                /* No mapping found means: mapping is undefined. */
8341
0
                PyErr_Clear();
8342
0
                goto Undefined;
8343
0
            } else
8344
0
                goto onError;
8345
0
        }
8346
8347
        /* Apply mapping */
8348
0
        if (item == Py_None)
8349
0
            goto Undefined;
8350
0
        if (PyLong_Check(item)) {
8351
0
            long value = PyLong_AsLong(item);
8352
0
            if (value == 0xFFFE)
8353
0
                goto Undefined;
8354
0
            if (value < 0 || value > MAX_UNICODE) {
8355
0
                PyErr_Format(PyExc_TypeError,
8356
0
                             "character mapping must be in range(0x%x)",
8357
0
                             (unsigned long)MAX_UNICODE + 1);
8358
0
                goto onError;
8359
0
            }
8360
8361
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8362
0
                goto onError;
8363
0
        }
8364
0
        else if (PyUnicode_Check(item)) {
8365
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8366
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8367
0
                if (value == 0xFFFE)
8368
0
                    goto Undefined;
8369
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8370
0
                    goto onError;
8371
0
            }
8372
0
            else {
8373
0
                writer->overallocate = 1;
8374
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8375
0
                    goto onError;
8376
0
            }
8377
0
        }
8378
0
        else {
8379
            /* wrong return value */
8380
0
            PyErr_SetString(PyExc_TypeError,
8381
0
                            "character mapping must return integer, None or str");
8382
0
            goto onError;
8383
0
        }
8384
0
        Py_CLEAR(item);
8385
0
        ++s;
8386
0
        continue;
8387
8388
0
Undefined:
8389
        /* undefined mapping */
8390
0
        Py_CLEAR(item);
8391
0
        startinpos = s-starts;
8392
0
        endinpos = startinpos+1;
8393
0
        if (unicode_decode_call_errorhandler_writer(
8394
0
                errors, &errorHandler,
8395
0
                "charmap", "character maps to <undefined>",
8396
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8397
0
                writer)) {
8398
0
            goto onError;
8399
0
        }
8400
0
    }
8401
0
    Py_XDECREF(errorHandler);
8402
0
    Py_XDECREF(exc);
8403
0
    return 0;
8404
8405
0
onError:
8406
0
    Py_XDECREF(item);
8407
0
    Py_XDECREF(errorHandler);
8408
0
    Py_XDECREF(exc);
8409
0
    return -1;
8410
0
}
8411
8412
PyObject *
8413
PyUnicode_DecodeCharmap(const char *s,
8414
                        Py_ssize_t size,
8415
                        PyObject *mapping,
8416
                        const char *errors)
8417
636k
{
8418
636k
    _PyUnicodeWriter writer;
8419
8420
    /* Default to Latin-1 */
8421
636k
    if (mapping == NULL)
8422
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8423
8424
636k
    if (size == 0)
8425
0
        _Py_RETURN_UNICODE_EMPTY();
8426
636k
    _PyUnicodeWriter_Init(&writer);
8427
636k
    writer.min_length = size;
8428
636k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8429
0
        goto onError;
8430
8431
636k
    if (PyUnicode_CheckExact(mapping)) {
8432
636k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8433
17
            goto onError;
8434
636k
    }
8435
0
    else {
8436
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8437
0
            goto onError;
8438
0
    }
8439
636k
    return _PyUnicodeWriter_Finish(&writer);
8440
8441
17
  onError:
8442
17
    _PyUnicodeWriter_Dealloc(&writer);
8443
17
    return NULL;
8444
636k
}
8445
8446
/* Charmap encoding: the lookup table */
8447
8448
/*[clinic input]
8449
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8450
[clinic start generated code]*/
8451
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8452
8453
struct encoding_map {
8454
    PyObject_HEAD
8455
    unsigned char level1[32];
8456
    int count2, count3;
8457
    unsigned char level23[1];
8458
};
8459
8460
/*[clinic input]
8461
EncodingMap.size
8462
8463
Return the size (in bytes) of this object.
8464
[clinic start generated code]*/
8465
8466
static PyObject *
8467
EncodingMap_size_impl(struct encoding_map *self)
8468
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8469
0
{
8470
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8471
0
                           128*self->count3);
8472
0
}
8473
8474
static PyMethodDef encoding_map_methods[] = {
8475
    ENCODINGMAP_SIZE_METHODDEF
8476
    {NULL, NULL}
8477
};
8478
8479
static PyTypeObject EncodingMapType = {
8480
    PyVarObject_HEAD_INIT(NULL, 0)
8481
    .tp_name = "EncodingMap",
8482
    .tp_basicsize = sizeof(struct encoding_map),
8483
    /* methods */
8484
    .tp_flags = Py_TPFLAGS_DEFAULT,
8485
    .tp_methods = encoding_map_methods,
8486
};
8487
8488
/* Build an encoding table from a decoding table.

   `string` is a non-empty str whose character at index i is the decoding of
   byte value i; only the first 256 characters are used and U+FFFE marks an
   unmapped byte.

   Return a new EncodingMap object holding a three-level trie, or a dict
   {code point: byte value} when the trie cannot represent the table.
   Return NULL with an exception set on error. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];
    unsigned char level2[512];
    int count2 = 0, count3 = 0;
    int need_dict = 0;
    int i;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    const int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }
    /* First pass: count the level-2 and level-3 blocks the trie needs. */
    for (i = 1; i < length; i++) {
        const Py_UCS4 cp = PyUnicode_READ(kind, data, i);
        if (cp == 0 || cp > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (cp == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        const int slot1 = cp >> 11;
        const int slot2 = cp >> 7;
        if (level1[slot1] == 0xFF) {
            level1[slot1] = count2++;
        }
        if (level2[slot2] == 0xFF) {
            level2[slot2] = count3++;
        }
    }

    /* 0xFF is the "unused" marker, so it cannot be a block number. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            PyObject *key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            PyObject *value = (key != NULL) ? PyLong_FromLong(i) : NULL;
            int rc = (value != NULL) ? PyDict_SetItem(dict, key, value) : -1;
            Py_XDECREF(key);
            Py_XDECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    struct encoding_map *map = (struct encoding_map*)result;
    map->count2 = count2;
    map->count3 = count3;
    unsigned char *trie1 = map->level1;
    unsigned char *trie2 = map->level23;
    unsigned char *trie3 = map->level23 + 16*count2;
    memcpy(trie1, level1, sizeof level1);
    memset(trie2, 0xFF, 16*count2);
    memset(trie3, 0, 128*count3);

    /* Second pass: hand out level-3 blocks again, in the same order, and
       store each byte value in its level-3 slot. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        const Py_UCS4 cp = PyUnicode_READ(kind, data, i);
        if (cp == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        const int hi = cp>>11;
        const int mid = (cp>>7) & 0xF;
        const int lo = cp & 0x7F;
        const int idx2 = 16*trie1[hi] + mid;
        if (trie2[idx2] == 0xFF) {
            trie2[idx2] = count3++;
        }
        const int idx3 = 128*trie2[idx2] + lo;
        trie3[idx3] = i;
    }
    return result;
}
8603
8604
static int
8605
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8606
0
{
8607
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8608
0
    int l1 = c>>11;
8609
0
    int l2 = (c>>7) & 0xF;
8610
0
    int l3 = c & 0x7F;
8611
0
    int i;
8612
8613
0
    if (c > 0xFFFF)
8614
0
        return -1;
8615
0
    if (c == 0)
8616
0
        return 0;
8617
    /* level 1*/
8618
0
    i = map->level1[l1];
8619
0
    if (i == 0xFF) {
8620
0
        return -1;
8621
0
    }
8622
    /* level 2*/
8623
0
    i = map->level23[16*i+l2];
8624
0
    if (i == 0xFF) {
8625
0
        return -1;
8626
0
    }
8627
    /* level 3 */
8628
0
    i = map->level23[16*map->count2 + 128*i + l3];
8629
0
    if (i == 0) {
8630
0
        return -1;
8631
0
    }
8632
0
    return i;
8633
0
}
8634
8635
/* Lookup the character in the mapping.
8636
   On success, return PyLong, PyBytes or None (if the character can't be found).
8637
   If the result is PyLong, put its value in replace.
8638
   On error, return NULL.
8639
   */
8640
static PyObject *
8641
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8642
0
{
8643
0
    PyObject *w = PyLong_FromLong((long)c);
8644
0
    PyObject *x;
8645
8646
0
    if (w == NULL)
8647
0
        return NULL;
8648
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8649
0
    Py_DECREF(w);
8650
0
    if (rc == 0) {
8651
        /* No mapping found means: mapping is undefined. */
8652
0
        Py_RETURN_NONE;
8653
0
    }
8654
0
    if (x == NULL) {
8655
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8656
            /* No mapping found means: mapping is undefined. */
8657
0
            PyErr_Clear();
8658
0
            Py_RETURN_NONE;
8659
0
        } else
8660
0
            return NULL;
8661
0
    }
8662
0
    else if (x == Py_None)
8663
0
        return x;
8664
0
    else if (PyLong_Check(x)) {
8665
0
        long value = PyLong_AsLong(x);
8666
0
        if (value < 0 || value > 255) {
8667
0
            PyErr_SetString(PyExc_TypeError,
8668
0
                            "character mapping must be in range(256)");
8669
0
            Py_DECREF(x);
8670
0
            return NULL;
8671
0
        }
8672
0
        *replace = (unsigned char)value;
8673
0
        return x;
8674
0
    }
8675
0
    else if (PyBytes_Check(x))
8676
0
        return x;
8677
0
    else {
8678
        /* wrong return value */
8679
0
        PyErr_Format(PyExc_TypeError,
8680
0
                     "character mapping must return integer, bytes or None, not %.400s",
8681
0
                     Py_TYPE(x)->tp_name);
8682
0
        Py_DECREF(x);
8683
0
        return NULL;
8684
0
    }
8685
0
}
8686
8687
static int
8688
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8689
0
{
8690
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8691
    /* exponentially overallocate to minimize reallocations */
8692
0
    if (requiredsize < 2 * outsize)
8693
0
        requiredsize = 2 * outsize;
8694
0
    return PyBytesWriter_Resize(writer, requiredsize);
8695
0
}
8696
8697
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8700
/* lookup the character, put the result in the output string and adjust
8701
   various state variables. Resize the output bytes object if not enough
8702
   space is available. Return a new reference to the object that
8703
   was put in the output buffer, or Py_None, if the mapping was undefined
8704
   (in which case no character was written) or NULL, if a
8705
   reallocation error occurred. The caller must decref the result */
8706
static charmapencode_result
8707
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8708
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8709
0
{
8710
0
    PyObject *rep;
8711
0
    unsigned char replace;
8712
0
    char *outstart;
8713
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8714
8715
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8716
0
        int res = encoding_map_lookup(c, mapping);
8717
0
        Py_ssize_t requiredsize = *outpos+1;
8718
0
        if (res == -1) {
8719
0
            return enc_FAILED;
8720
0
        }
8721
8722
0
        if (outsize<requiredsize) {
8723
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8724
0
                return enc_EXCEPTION;
8725
0
            }
8726
0
        }
8727
0
        outstart = _PyBytesWriter_GetData(writer);
8728
0
        outstart[(*outpos)++] = (char)res;
8729
0
        return enc_SUCCESS;
8730
0
    }
8731
8732
0
    rep = charmapencode_lookup(c, mapping, &replace);
8733
0
    if (rep==NULL)
8734
0
        return enc_EXCEPTION;
8735
0
    else if (rep==Py_None) {
8736
0
        Py_DECREF(rep);
8737
0
        return enc_FAILED;
8738
0
    } else {
8739
0
        if (PyLong_Check(rep)) {
8740
0
            Py_ssize_t requiredsize = *outpos+1;
8741
0
            if (outsize<requiredsize)
8742
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8743
0
                    Py_DECREF(rep);
8744
0
                    return enc_EXCEPTION;
8745
0
                }
8746
0
            outstart = _PyBytesWriter_GetData(writer);
8747
0
            outstart[(*outpos)++] = (char)replace;
8748
0
        }
8749
0
        else {
8750
0
            const char *repchars = PyBytes_AS_STRING(rep);
8751
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8752
0
            Py_ssize_t requiredsize = *outpos+repsize;
8753
0
            if (outsize<requiredsize)
8754
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8755
0
                    Py_DECREF(rep);
8756
0
                    return enc_EXCEPTION;
8757
0
                }
8758
0
            outstart = _PyBytesWriter_GetData(writer);
8759
0
            memcpy(outstart + *outpos, repchars, repsize);
8760
0
            *outpos += repsize;
8761
0
        }
8762
0
    }
8763
0
    Py_DECREF(rep);
8764
0
    return enc_SUCCESS;
8765
0
}
8766
8767
/* handle an error in _PyUnicode_EncodeCharmap()
8768
   Return 0 on success, -1 on error */
8769
static int
8770
charmap_encoding_error(
8771
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8772
    PyObject **exceptionObject,
8773
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8774
    PyBytesWriter *writer, Py_ssize_t *respos)
8775
0
{
8776
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8777
0
    Py_ssize_t size, repsize;
8778
0
    Py_ssize_t newpos;
8779
0
    int kind;
8780
0
    const void *data;
8781
0
    Py_ssize_t index;
8782
    /* startpos for collecting unencodable chars */
8783
0
    Py_ssize_t collstartpos = *inpos;
8784
0
    Py_ssize_t collendpos = *inpos+1;
8785
0
    Py_ssize_t collpos;
8786
0
    const char *encoding = "charmap";
8787
0
    const char *reason = "character maps to <undefined>";
8788
0
    charmapencode_result x;
8789
0
    Py_UCS4 ch;
8790
0
    int val;
8791
8792
0
    size = PyUnicode_GET_LENGTH(unicode);
8793
    /* find all unencodable characters */
8794
0
    while (collendpos < size) {
8795
0
        PyObject *rep;
8796
0
        unsigned char replace;
8797
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8798
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8799
0
            val = encoding_map_lookup(ch, mapping);
8800
0
            if (val != -1)
8801
0
                break;
8802
0
            ++collendpos;
8803
0
            continue;
8804
0
        }
8805
8806
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8807
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8808
0
        if (rep==NULL)
8809
0
            return -1;
8810
0
        else if (rep!=Py_None) {
8811
0
            Py_DECREF(rep);
8812
0
            break;
8813
0
        }
8814
0
        Py_DECREF(rep);
8815
0
        ++collendpos;
8816
0
    }
8817
    /* cache callback name lookup
8818
     * (if not done yet, i.e. it's the first error) */
8819
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8820
0
        *error_handler = _Py_GetErrorHandler(errors);
8821
8822
0
    switch (*error_handler) {
8823
0
    case _Py_ERROR_STRICT:
8824
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8825
0
        return -1;
8826
8827
0
    case _Py_ERROR_REPLACE:
8828
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8829
0
            x = charmapencode_output('?', mapping, writer, respos);
8830
0
            if (x==enc_EXCEPTION) {
8831
0
                return -1;
8832
0
            }
8833
0
            else if (x==enc_FAILED) {
8834
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8835
0
                return -1;
8836
0
            }
8837
0
        }
8838
0
        _Py_FALLTHROUGH;
8839
0
    case _Py_ERROR_IGNORE:
8840
0
        *inpos = collendpos;
8841
0
        break;
8842
8843
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8844
        /* generate replacement (temporarily (mis)uses p) */
8845
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8846
0
            char buffer[2+29+1+1];
8847
0
            char *cp;
8848
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8849
0
            for (cp = buffer; *cp; ++cp) {
8850
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8851
0
                if (x==enc_EXCEPTION)
8852
0
                    return -1;
8853
0
                else if (x==enc_FAILED) {
8854
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8855
0
                    return -1;
8856
0
                }
8857
0
            }
8858
0
        }
8859
0
        *inpos = collendpos;
8860
0
        break;
8861
8862
0
    default:
8863
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8864
0
                                                      encoding, reason, unicode, exceptionObject,
8865
0
                                                      collstartpos, collendpos, &newpos);
8866
0
        if (repunicode == NULL)
8867
0
            return -1;
8868
0
        if (PyBytes_Check(repunicode)) {
8869
            /* Directly copy bytes result to output. */
8870
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8871
0
            Py_ssize_t requiredsize;
8872
0
            repsize = PyBytes_Size(repunicode);
8873
0
            requiredsize = *respos + repsize;
8874
0
            if (requiredsize > outsize)
8875
                /* Make room for all additional bytes. */
8876
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8877
0
                    Py_DECREF(repunicode);
8878
0
                    return -1;
8879
0
                }
8880
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8881
0
                   PyBytes_AsString(repunicode),  repsize);
8882
0
            *respos += repsize;
8883
0
            *inpos = newpos;
8884
0
            Py_DECREF(repunicode);
8885
0
            break;
8886
0
        }
8887
        /* generate replacement  */
8888
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8889
0
        data = PyUnicode_DATA(repunicode);
8890
0
        kind = PyUnicode_KIND(repunicode);
8891
0
        for (index = 0; index < repsize; index++) {
8892
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8893
0
            x = charmapencode_output(repch, mapping, writer, respos);
8894
0
            if (x==enc_EXCEPTION) {
8895
0
                Py_DECREF(repunicode);
8896
0
                return -1;
8897
0
            }
8898
0
            else if (x==enc_FAILED) {
8899
0
                Py_DECREF(repunicode);
8900
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8901
0
                return -1;
8902
0
            }
8903
0
        }
8904
0
        *inpos = newpos;
8905
0
        Py_DECREF(repunicode);
8906
0
    }
8907
0
    return 0;
8908
0
}
8909
8910
/* Encode the str object `unicode` to bytes using `mapping`.

   With mapping == NULL the string is encoded as Latin-1.  Otherwise each
   character is looked up in the mapping: directly in the table when the
   mapping is an EncodingMap, through charmapencode_output() for any other
   mapping.  Unencodable characters are passed to charmap_encoding_error(),
   which applies the handler named by `errors`.

   Return a new bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    const int kind = PyUnicode_KIND(unicode);

    /* State shared with charmap_encoding_error() across all errors. */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* Start with room for one byte per character; the buffer is grown
       when replacements need more. */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        /* Nothing has been acquired yet, so there is nothing to release. */
        return NULL;
    }

    Py_ssize_t inpos = 0;       /* next character to encode */
    Py_ssize_t respos = 0;      /* next free byte in the output */

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        char *out = _PyBytesWriter_GetData(writer);
        Py_ssize_t capacity = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int byte = encoding_map_lookup(ch, mapping);

            if (byte != -1) {
                Py_ssize_t needed = respos + 1;
                if (capacity < needed) {
                    if (charmapencode_resize(writer, &respos, needed)) {
                        goto onError;
                    }
                    out = _PyBytesWriter_GetData(writer);
                    capacity = _PyBytesWriter_GetSize(writer);
                }
                out[respos++] = (char)byte;
                ++inpos;
                continue;
            }

            /* Unencodable character: the handler advances inpos itself. */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
            /* The handler may have resized the writer; refresh the cache. */
            out = _PyBytesWriter_GetData(writer);
            capacity = _PyBytesWriter_GetSize(writer);
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            charmapencode_result status =
                charmapencode_output(ch, mapping, writer, &respos);

            if (status == enc_EXCEPTION) {
                goto onError;
            }
            if (status != enc_FAILED) {
                /* Encoded: move on to the next character. */
                ++inpos;
                continue;
            }
            /* Unencodable character: the handler advances inpos itself. */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Trim the result to the bytes actually written. */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9018
9019
/* Encode the str object `unicode` with `mapping`, without naming an error
   handler (errors is passed as NULL).

   Both arguments are required: a non-str `unicode` or a NULL `mapping` is
   reported through PyErr_BadArgument() and NULL is returned. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }
    PyErr_BadArgument();
    return NULL;
}
9029
9030
/* create or adjust a UnicodeTranslateError */
9031
static void
9032
make_translate_exception(PyObject **exceptionObject,
9033
                         PyObject *unicode,
9034
                         Py_ssize_t startpos, Py_ssize_t endpos,
9035
                         const char *reason)
9036
0
{
9037
0
    if (*exceptionObject == NULL) {
9038
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9039
0
            unicode, startpos, endpos, reason);
9040
0
    }
9041
0
    else {
9042
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9043
0
            goto onError;
9044
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9045
0
            goto onError;
9046
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9047
0
            goto onError;
9048
0
        return;
9049
0
      onError:
9050
0
        Py_CLEAR(*exceptionObject);
9051
0
    }
9052
0
}
9053
9054
/* error handling callback helper:
9055
   build arguments, call the callback and check the arguments,
9056
   put the result into newpos and return the replacement string, which
9057
   has to be freed by the caller */
9058
static PyObject *
9059
unicode_translate_call_errorhandler(const char *errors,
9060
                                    PyObject **errorHandler,
9061
                                    const char *reason,
9062
                                    PyObject *unicode, PyObject **exceptionObject,
9063
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9064
                                    Py_ssize_t *newpos)
9065
0
{
9066
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9067
9068
0
    Py_ssize_t i_newpos;
9069
0
    PyObject *restuple;
9070
0
    PyObject *resunicode;
9071
9072
0
    if (*errorHandler == NULL) {
9073
0
        *errorHandler = PyCodec_LookupError(errors);
9074
0
        if (*errorHandler == NULL)
9075
0
            return NULL;
9076
0
    }
9077
9078
0
    make_translate_exception(exceptionObject,
9079
0
                             unicode, startpos, endpos, reason);
9080
0
    if (*exceptionObject == NULL)
9081
0
        return NULL;
9082
9083
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9084
0
    if (restuple == NULL)
9085
0
        return NULL;
9086
0
    if (!PyTuple_Check(restuple)) {
9087
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9088
0
        Py_DECREF(restuple);
9089
0
        return NULL;
9090
0
    }
9091
0
    if (!PyArg_ParseTuple(restuple, argparse,
9092
0
                          &resunicode, &i_newpos)) {
9093
0
        Py_DECREF(restuple);
9094
0
        return NULL;
9095
0
    }
9096
0
    if (i_newpos<0)
9097
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9098
0
    else
9099
0
        *newpos = i_newpos;
9100
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9101
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9102
0
        Py_DECREF(restuple);
9103
0
        return NULL;
9104
0
    }
9105
0
    Py_INCREF(resunicode);
9106
0
    Py_DECREF(restuple);
9107
0
    return resunicode;
9108
0
}
9109
9110
/* Lookup the character ch in the mapping and put the result in result,
9111
   which must be decrefed by the caller.
9112
   The result can be PyLong, PyUnicode, None or NULL.
9113
   If the result is PyLong, put its value in replace.
9114
   Return 0 on success, -1 on error */
9115
static int
9116
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9117
18.8k
{
9118
18.8k
    PyObject *w = PyLong_FromLong((long)c);
9119
18.8k
    PyObject *x;
9120
9121
18.8k
    if (w == NULL)
9122
0
        return -1;
9123
18.8k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9124
18.8k
    Py_DECREF(w);
9125
18.8k
    if (rc == 0) {
9126
        /* No mapping found means: use 1:1 mapping. */
9127
6.35k
        *result = NULL;
9128
6.35k
        return 0;
9129
6.35k
    }
9130
12.4k
    if (x == NULL) {
9131
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9132
            /* No mapping found means: use 1:1 mapping. */
9133
0
            PyErr_Clear();
9134
0
            *result = NULL;
9135
0
            return 0;
9136
0
        } else
9137
0
            return -1;
9138
0
    }
9139
12.4k
    else if (x == Py_None) {
9140
0
        *result = x;
9141
0
        return 0;
9142
0
    }
9143
12.4k
    else if (PyLong_Check(x)) {
9144
0
        long value = PyLong_AsLong(x);
9145
0
        if (value < 0 || value > MAX_UNICODE) {
9146
0
            PyErr_Format(PyExc_ValueError,
9147
0
                         "character mapping must be in range(0x%x)",
9148
0
                         MAX_UNICODE+1);
9149
0
            Py_DECREF(x);
9150
0
            return -1;
9151
0
        }
9152
0
        *result = x;
9153
0
        *replace = (Py_UCS4)value;
9154
0
        return 0;
9155
0
    }
9156
12.4k
    else if (PyUnicode_Check(x)) {
9157
12.4k
        *result = x;
9158
12.4k
        return 0;
9159
12.4k
    }
9160
0
    else {
9161
        /* wrong return value */
9162
0
        PyErr_SetString(PyExc_TypeError,
9163
0
                        "character mapping must return integer, None or str");
9164
0
        Py_DECREF(x);
9165
0
        return -1;
9166
0
    }
9167
12.4k
}
9168
9169
/* lookup the character, write the result into the writer.
9170
   Return 1 if the result was written into the writer, return 0 if the mapping
9171
   was undefined, raise an exception return -1 on error. */
9172
static int
9173
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9174
                        _PyUnicodeWriter *writer)
9175
6.40k
{
9176
6.40k
    PyObject *item;
9177
6.40k
    Py_UCS4 replace;
9178
9179
6.40k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9180
0
        return -1;
9181
9182
6.40k
    if (item == NULL) {
9183
        /* not found => default to 1:1 mapping */
9184
112
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9185
0
            return -1;
9186
0
        }
9187
112
        return 1;
9188
112
    }
9189
9190
6.29k
    if (item == Py_None) {
9191
0
        Py_DECREF(item);
9192
0
        return 0;
9193
0
    }
9194
9195
6.29k
    if (PyLong_Check(item)) {
9196
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9197
0
            Py_DECREF(item);
9198
0
            return -1;
9199
0
        }
9200
0
        Py_DECREF(item);
9201
0
        return 1;
9202
0
    }
9203
9204
6.29k
    if (!PyUnicode_Check(item)) {
9205
0
        Py_DECREF(item);
9206
0
        return -1;
9207
0
    }
9208
9209
6.29k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9210
0
        Py_DECREF(item);
9211
0
        return -1;
9212
0
    }
9213
9214
6.29k
    Py_DECREF(item);
9215
6.29k
    return 1;
9216
6.29k
}
9217
9218
static int
9219
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9220
                              Py_UCS1 *translate)
9221
12.4k
{
9222
12.4k
    PyObject *item = NULL;
9223
12.4k
    Py_UCS4 replace;
9224
12.4k
    int ret = 0;
9225
9226
12.4k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9227
0
        return -1;
9228
0
    }
9229
9230
12.4k
    if (item == Py_None) {
9231
        /* deletion */
9232
0
        translate[ch] = 0xfe;
9233
0
    }
9234
12.4k
    else if (item == NULL) {
9235
        /* not found => default to 1:1 mapping */
9236
6.24k
        translate[ch] = ch;
9237
6.24k
        return 1;
9238
6.24k
    }
9239
6.18k
    else if (PyLong_Check(item)) {
9240
0
        if (replace > 127) {
9241
            /* invalid character or character outside ASCII:
9242
               skip the fast translate */
9243
0
            goto exit;
9244
0
        }
9245
0
        translate[ch] = (Py_UCS1)replace;
9246
0
    }
9247
6.18k
    else if (PyUnicode_Check(item)) {
9248
6.18k
        if (PyUnicode_GET_LENGTH(item) != 1)
9249
6.18k
            goto exit;
9250
9251
0
        replace = PyUnicode_READ_CHAR(item, 0);
9252
0
        if (replace > 127)
9253
0
            goto exit;
9254
0
        translate[ch] = (Py_UCS1)replace;
9255
0
    }
9256
0
    else {
9257
        /* not None, NULL, long or unicode */
9258
0
        goto exit;
9259
0
    }
9260
0
    ret = 1;
9261
9262
6.18k
  exit:
9263
6.18k
    Py_DECREF(item);
9264
6.18k
    return ret;
9265
0
}
9266
9267
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9268
   was translated into writer, return 0 if the input string was partially
9269
   translated into writer, raise an exception and return -1 on error. */
9270
static int
9271
unicode_fast_translate(PyObject *input, PyObject *mapping,
9272
                       _PyUnicodeWriter *writer, int ignore,
9273
                       Py_ssize_t *input_pos)
9274
12.3k
{
9275
12.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9276
12.3k
    Py_ssize_t len;
9277
12.3k
    const Py_UCS1 *in, *end;
9278
12.3k
    Py_UCS1 *out;
9279
12.3k
    int res = 0;
9280
9281
12.3k
    len = PyUnicode_GET_LENGTH(input);
9282
9283
12.3k
    memset(ascii_table, 0xff, 128);
9284
9285
12.3k
    in = PyUnicode_1BYTE_DATA(input);
9286
12.3k
    end = in + len;
9287
9288
12.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9289
12.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9290
12.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9291
9292
18.6k
    for (; in < end; in++) {
9293
12.4k
        ch = *in;
9294
12.4k
        ch2 = ascii_table[ch];
9295
12.4k
        if (ch2 == 0xff) {
9296
12.4k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9297
12.4k
                                                          ascii_table);
9298
12.4k
            if (translate < 0)
9299
0
                return -1;
9300
12.4k
            if (translate == 0)
9301
6.18k
                goto exit;
9302
6.24k
            ch2 = ascii_table[ch];
9303
6.24k
        }
9304
6.28k
        if (ch2 == 0xfe) {
9305
0
            if (ignore)
9306
0
                continue;
9307
0
            goto exit;
9308
0
        }
9309
6.28k
        assert(ch2 < 128);
9310
6.28k
        *out = ch2;
9311
6.28k
        out++;
9312
6.28k
    }
9313
6.17k
    res = 1;
9314
9315
12.3k
exit:
9316
12.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9317
12.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9318
12.3k
    return res;
9319
6.17k
}
9320
9321
/* Translate the str object `input` through `mapping`.

   ASCII input is first run through unicode_fast_translate(); whatever that
   leaves untranslated, and all non-ASCII input, goes through the general
   loop below.  A run of consecutive characters that the mapping turns into
   None is dropped when `errors` is "ignore"; otherwise the error handler
   named by `errors` is called for the run and its replacement string is
   written instead.

   Return a new str object, or NULL with an exception set. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    const char *reason = "character maps to <undefined>";
    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_ssize_t i = 0;

    /* Start with room for a 1:1 translation; the writer grows on demand. */
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    if (PyUnicode_IS_ASCII(input)) {
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* Partial translation: continue from `i` with the general loop. */
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translated = charmaptranslate_output(ch, mapping, &writer);
        if (translated < 0) {
            goto onError;
        }
        if (translated != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* `ch` maps to None.  Collect every following character that maps
           to None as well, so the handler sees the whole run at once. */
        Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        while (collend < size) {
            PyObject *item;
            Py_UCS4 replace;
            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
                goto onError;
            }
            int untranslatable = (item == Py_None);
            Py_XDECREF(item);
            if (!untranslatable) {
                break;
            }
            ++collend;
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, repunicode);
        Py_DECREF(repunicode);
        if (rc < 0) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9437
9438
/* Translate `str` through `mapping` with the error handler named by
   `errors`.  `str` is validated with ensure_unicode(); NULL is returned
   when that check fails. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9447
9448
/* Return an ASCII version of `unicode` in which whitespace characters are
   replaced by ' ' and decimal digits by their ASCII digit.

   An ASCII string is returned as is (new reference).  Otherwise a new
   string is built; at the first character that is neither a space nor a
   decimal digit a '?' is written and the result is cut off right after it.

   Return NULL with an exception set if `unicode` is not a str or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* NOTE(review): U+007F fails this test and therefore goes through
           the space/decimal checks below. */
        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* Neither space nor digit: mark it and truncate the result here. */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9493
9494
/* --- Helpers ------------------------------------------------------------ */
9495
9496
/* Helper macro to fix up start/end slice values for a sequence of length
   `len`:

     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and is clamped to 0
       if it is still negative.

   `start` is not clamped to `len`.  `start` and `end` must be modifiable
   lvalues; every argument is evaluated more than once, so arguments must
   not have side effects.  Arguments are parenthesized so that expressions
   (for example a conditional expression as `len`) are parsed as intended. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9515
9516
static Py_ssize_t
9517
any_find_slice(PyObject* s1, PyObject* s2,
9518
               Py_ssize_t start,
9519
               Py_ssize_t end,
9520
               int direction)
9521
24.2M
{
9522
24.2M
    int kind1, kind2;
9523
24.2M
    const void *buf1, *buf2;
9524
24.2M
    Py_ssize_t len1, len2, result;
9525
9526
24.2M
    kind1 = PyUnicode_KIND(s1);
9527
24.2M
    kind2 = PyUnicode_KIND(s2);
9528
24.2M
    if (kind1 < kind2)
9529
0
        return -1;
9530
9531
24.2M
    len1 = PyUnicode_GET_LENGTH(s1);
9532
24.2M
    len2 = PyUnicode_GET_LENGTH(s2);
9533
24.2M
    ADJUST_INDICES(start, end, len1);
9534
24.2M
    if (end - start < len2)
9535
1.67M
        return -1;
9536
9537
22.5M
    buf1 = PyUnicode_DATA(s1);
9538
22.5M
    buf2 = PyUnicode_DATA(s2);
9539
22.5M
    if (len2 == 1) {
9540
21.9M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9541
21.9M
        result = findchar((const char *)buf1 + kind1*start,
9542
21.9M
                          kind1, end - start, ch, direction);
9543
21.9M
        if (result == -1)
9544
4.08M
            return -1;
9545
17.8M
        else
9546
17.8M
            return start + result;
9547
21.9M
    }
9548
9549
567k
    if (kind2 != kind1) {
9550
298k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9551
298k
        if (!buf2)
9552
0
            return -2;
9553
298k
    }
9554
9555
567k
    if (direction > 0) {
9556
567k
        switch (kind1) {
9557
269k
        case PyUnicode_1BYTE_KIND:
9558
269k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9559
229k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9560
40.2k
            else
9561
40.2k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9562
269k
            break;
9563
139k
        case PyUnicode_2BYTE_KIND:
9564
139k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9565
139k
            break;
9566
158k
        case PyUnicode_4BYTE_KIND:
9567
158k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9568
158k
            break;
9569
0
        default:
9570
0
            Py_UNREACHABLE();
9571
567k
        }
9572
567k
    }
9573
0
    else {
9574
0
        switch (kind1) {
9575
0
        case PyUnicode_1BYTE_KIND:
9576
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9577
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9578
0
            else
9579
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9580
0
            break;
9581
0
        case PyUnicode_2BYTE_KIND:
9582
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            break;
9584
0
        case PyUnicode_4BYTE_KIND:
9585
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9586
0
            break;
9587
0
        default:
9588
0
            Py_UNREACHABLE();
9589
0
        }
9590
0
    }
9591
9592
567k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9593
567k
    if (kind2 != kind1)
9594
298k
        PyMem_Free((void *)buf2);
9595
9596
567k
    return result;
9597
567k
}
9598
9599
9600
Py_ssize_t
9601
PyUnicode_Count(PyObject *str,
9602
                PyObject *substr,
9603
                Py_ssize_t start,
9604
                Py_ssize_t end)
9605
0
{
9606
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9607
0
        return -1;
9608
9609
0
    return unicode_count_impl(str, substr, start, end);
9610
0
}
9611
9612
Py_ssize_t
9613
PyUnicode_Find(PyObject *str,
9614
               PyObject *substr,
9615
               Py_ssize_t start,
9616
               Py_ssize_t end,
9617
               int direction)
9618
0
{
9619
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9620
0
        return -2;
9621
9622
0
    return any_find_slice(str, substr, start, end, direction);
9623
0
}
9624
9625
Py_ssize_t
9626
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9627
                   Py_ssize_t start, Py_ssize_t end,
9628
                   int direction)
9629
3.53M
{
9630
3.53M
    int kind;
9631
3.53M
    Py_ssize_t len, result;
9632
3.53M
    len = PyUnicode_GET_LENGTH(str);
9633
3.53M
    ADJUST_INDICES(start, end, len);
9634
3.53M
    if (end - start < 1)
9635
0
        return -1;
9636
3.53M
    kind = PyUnicode_KIND(str);
9637
3.53M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9638
3.53M
                      kind, end-start, ch, direction);
9639
3.53M
    if (result == -1)
9640
2.57M
        return -1;
9641
961k
    else
9642
961k
        return start + result;
9643
3.53M
}
9644
9645
static int
9646
tailmatch(PyObject *self,
9647
          PyObject *substring,
9648
          Py_ssize_t start,
9649
          Py_ssize_t end,
9650
          int direction)
9651
104M
{
9652
104M
    int kind_self;
9653
104M
    int kind_sub;
9654
104M
    const void *data_self;
9655
104M
    const void *data_sub;
9656
104M
    Py_ssize_t offset;
9657
104M
    Py_ssize_t i;
9658
104M
    Py_ssize_t end_sub;
9659
9660
104M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9661
104M
    end -= PyUnicode_GET_LENGTH(substring);
9662
104M
    if (end < start)
9663
8.89M
        return 0;
9664
9665
95.3M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9666
0
        return 1;
9667
9668
95.3M
    kind_self = PyUnicode_KIND(self);
9669
95.3M
    data_self = PyUnicode_DATA(self);
9670
95.3M
    kind_sub = PyUnicode_KIND(substring);
9671
95.3M
    data_sub = PyUnicode_DATA(substring);
9672
95.3M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9673
9674
95.3M
    if (direction > 0)
9675
7.55M
        offset = end;
9676
87.8M
    else
9677
87.8M
        offset = start;
9678
9679
95.3M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9680
95.3M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9681
48.8M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9682
48.8M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9683
        /* If both are of the same kind, memcmp is sufficient */
9684
16.5M
        if (kind_self == kind_sub) {
9685
11.0M
            return ! memcmp((char *)data_self +
9686
11.0M
                                (offset * PyUnicode_KIND(substring)),
9687
11.0M
                            data_sub,
9688
11.0M
                            PyUnicode_GET_LENGTH(substring) *
9689
11.0M
                                PyUnicode_KIND(substring));
9690
11.0M
        }
9691
        /* otherwise we have to compare each character by first accessing it */
9692
5.56M
        else {
9693
            /* We do not need to compare 0 and len(substring)-1 because
9694
               the if statement above ensured already that they are equal
9695
               when we end up here. */
9696
6.06M
            for (i = 1; i < end_sub; ++i) {
9697
504k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9698
504k
                    PyUnicode_READ(kind_sub, data_sub, i))
9699
8.28k
                    return 0;
9700
504k
            }
9701
5.55M
            return 1;
9702
5.56M
        }
9703
16.5M
    }
9704
9705
78.8M
    return 0;
9706
95.3M
}
9707
9708
Py_ssize_t
9709
PyUnicode_Tailmatch(PyObject *str,
9710
                    PyObject *substr,
9711
                    Py_ssize_t start,
9712
                    Py_ssize_t end,
9713
                    int direction)
9714
135
{
9715
135
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9716
0
        return -1;
9717
9718
135
    return tailmatch(str, substr, start, end, direction);
9719
135
}
9720
9721
/* Create a new string with max char 127 holding the bytes of `self` run
   through _Py_bytes_lower() when `lower` is non-zero, or _Py_bytes_upper()
   otherwise.  Returns NULL if the result cannot be allocated.
   NOTE(review): the name and the 127 max char suggest callers only pass
   ASCII strings -- confirm at the call sites. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(nchars, 127);
    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nchars);
    }
    else {
        _Py_bytes_upper(dst, src, nchars);
    }
    return converted;
}
9739
9740
static Py_UCS4
9741
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9742
499k
{
9743
499k
    Py_ssize_t j;
9744
499k
    int final_sigma;
9745
499k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9746
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9747
9748
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9749
9750
    where ! is a negation and \p{xxx} is a character with property xxx.
9751
    */
9752
927k
    for (j = i - 1; j >= 0; j--) {
9753
925k
        c = PyUnicode_READ(kind, data, j);
9754
925k
        if (!_PyUnicode_IsCaseIgnorable(c))
9755
497k
            break;
9756
925k
    }
9757
499k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9758
499k
    if (final_sigma) {
9759
782k
        for (j = i + 1; j < length; j++) {
9760
780k
            c = PyUnicode_READ(kind, data, j);
9761
780k
            if (!_PyUnicode_IsCaseIgnorable(c))
9762
390k
                break;
9763
780k
        }
9764
392k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9765
392k
    }
9766
499k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9767
499k
}
9768
9769
static int
9770
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9771
           Py_UCS4 c, Py_UCS4 *mapped)
9772
119M
{
9773
    /* Obscure special case. */
9774
119M
    if (c == 0x3A3) {
9775
499k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9776
499k
        return 1;
9777
499k
    }
9778
119M
    return _PyUnicode_ToLowerFull(c, mapped);
9779
119M
}
9780
9781
static Py_ssize_t
9782
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9783
0
{
9784
0
    Py_ssize_t i, k = 0;
9785
0
    int n_res, j;
9786
0
    Py_UCS4 c, mapped[3];
9787
9788
0
    c = PyUnicode_READ(kind, data, 0);
9789
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9790
0
    for (j = 0; j < n_res; j++) {
9791
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9792
0
        res[k++] = mapped[j];
9793
0
    }
9794
0
    for (i = 1; i < length; i++) {
9795
0
        c = PyUnicode_READ(kind, data, i);
9796
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9797
0
        for (j = 0; j < n_res; j++) {
9798
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9799
0
            res[k++] = mapped[j];
9800
0
        }
9801
0
    }
9802
0
    return k;
9803
0
}
9804
9805
static Py_ssize_t
9806
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9807
0
    Py_ssize_t i, k = 0;
9808
9809
0
    for (i = 0; i < length; i++) {
9810
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9811
0
        int n_res, j;
9812
0
        if (Py_UNICODE_ISUPPER(c)) {
9813
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9814
0
        }
9815
0
        else if (Py_UNICODE_ISLOWER(c)) {
9816
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9817
0
        }
9818
0
        else {
9819
0
            n_res = 1;
9820
0
            mapped[0] = c;
9821
0
        }
9822
0
        for (j = 0; j < n_res; j++) {
9823
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9824
0
            res[k++] = mapped[j];
9825
0
        }
9826
0
    }
9827
0
    return k;
9828
0
}
9829
9830
static Py_ssize_t
9831
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9832
                  Py_UCS4 *maxchar, int lower)
9833
3.13M
{
9834
3.13M
    Py_ssize_t i, k = 0;
9835
9836
123M
    for (i = 0; i < length; i++) {
9837
119M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9838
119M
        int n_res, j;
9839
119M
        if (lower)
9840
119M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9841
0
        else
9842
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9843
239M
        for (j = 0; j < n_res; j++) {
9844
119M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9845
119M
            res[k++] = mapped[j];
9846
119M
        }
9847
119M
    }
9848
3.13M
    return k;
9849
3.13M
}
9850
9851
static Py_ssize_t
9852
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853
0
{
9854
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9855
0
}
9856
9857
static Py_ssize_t
9858
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9859
3.13M
{
9860
3.13M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9861
3.13M
}
9862
9863
static Py_ssize_t
9864
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9865
0
{
9866
0
    Py_ssize_t i, k = 0;
9867
9868
0
    for (i = 0; i < length; i++) {
9869
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9870
0
        Py_UCS4 mapped[3];
9871
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9872
0
        for (j = 0; j < n_res; j++) {
9873
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9874
0
            res[k++] = mapped[j];
9875
0
        }
9876
0
    }
9877
0
    return k;
9878
0
}
9879
9880
static Py_ssize_t
9881
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9882
0
{
9883
0
    Py_ssize_t i, k = 0;
9884
0
    int previous_is_cased;
9885
9886
0
    previous_is_cased = 0;
9887
0
    for (i = 0; i < length; i++) {
9888
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9889
0
        Py_UCS4 mapped[3];
9890
0
        int n_res, j;
9891
9892
0
        if (previous_is_cased)
9893
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9894
0
        else
9895
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9896
9897
0
        for (j = 0; j < n_res; j++) {
9898
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9899
0
            res[k++] = mapped[j];
9900
0
        }
9901
9902
0
        previous_is_cased = _PyUnicode_IsCased(c);
9903
0
    }
9904
0
    return k;
9905
0
}
9906
9907
static PyObject *
9908
case_operation(PyObject *self,
9909
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9910
3.13M
{
9911
3.13M
    PyObject *res = NULL;
9912
3.13M
    Py_ssize_t length, newlength = 0;
9913
3.13M
    int kind, outkind;
9914
3.13M
    const void *data;
9915
3.13M
    void *outdata;
9916
3.13M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9917
9918
3.13M
    kind = PyUnicode_KIND(self);
9919
3.13M
    data = PyUnicode_DATA(self);
9920
3.13M
    length = PyUnicode_GET_LENGTH(self);
9921
3.13M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9922
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9923
0
        return NULL;
9924
0
    }
9925
3.13M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9926
3.13M
    if (tmp == NULL)
9927
0
        return PyErr_NoMemory();
9928
3.13M
    newlength = perform(kind, data, length, tmp, &maxchar);
9929
3.13M
    res = PyUnicode_New(newlength, maxchar);
9930
3.13M
    if (res == NULL)
9931
0
        goto leave;
9932
3.13M
    tmpend = tmp + newlength;
9933
3.13M
    outdata = PyUnicode_DATA(res);
9934
3.13M
    outkind = PyUnicode_KIND(res);
9935
3.13M
    switch (outkind) {
9936
205k
    case PyUnicode_1BYTE_KIND:
9937
205k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9938
205k
        break;
9939
2.86M
    case PyUnicode_2BYTE_KIND:
9940
2.86M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9941
2.86M
        break;
9942
62.7k
    case PyUnicode_4BYTE_KIND:
9943
62.7k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9944
62.7k
        break;
9945
0
    default:
9946
0
        Py_UNREACHABLE();
9947
3.13M
    }
9948
3.13M
  leave:
9949
3.13M
    PyMem_Free(tmp);
9950
3.13M
    return res;
9951
3.13M
}
9952
9953
/* Join the items of `seq` with `separator`.

   `seq` is first converted with PySequence_Fast(); the actual work is done
   by _PyUnicode_JoinArray() inside a critical section opened on `seq`.
   Returns the joined string, or NULL if the conversion or the join fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");
    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
9977
9978
PyObject *
9979
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9980
81.9M
{
9981
81.9M
    PyObject *res = NULL; /* the result */
9982
81.9M
    PyObject *sep = NULL;
9983
81.9M
    Py_ssize_t seplen;
9984
81.9M
    PyObject *item;
9985
81.9M
    Py_ssize_t sz, i, res_offset;
9986
81.9M
    Py_UCS4 maxchar;
9987
81.9M
    Py_UCS4 item_maxchar;
9988
81.9M
    int use_memcpy;
9989
81.9M
    unsigned char *res_data = NULL, *sep_data = NULL;
9990
81.9M
    PyObject *last_obj;
9991
81.9M
    int kind = 0;
9992
9993
    /* If empty sequence, return u"". */
9994
81.9M
    if (seqlen == 0) {
9995
9.82M
        _Py_RETURN_UNICODE_EMPTY();
9996
9.82M
    }
9997
9998
    /* If singleton sequence with an exact Unicode, return that. */
9999
72.1M
    last_obj = NULL;
10000
72.1M
    if (seqlen == 1) {
10001
13.4M
        if (PyUnicode_CheckExact(items[0])) {
10002
12.0M
            res = items[0];
10003
12.0M
            return Py_NewRef(res);
10004
12.0M
        }
10005
1.40M
        seplen = 0;
10006
1.40M
        maxchar = 0;
10007
1.40M
    }
10008
58.6M
    else {
10009
        /* Set up sep and seplen */
10010
58.6M
        if (separator == NULL) {
10011
            /* fall back to a blank space separator */
10012
0
            sep = PyUnicode_FromOrdinal(' ');
10013
0
            if (!sep)
10014
0
                goto onError;
10015
0
            seplen = 1;
10016
0
            maxchar = 32;
10017
0
        }
10018
58.6M
        else {
10019
58.6M
            if (!PyUnicode_Check(separator)) {
10020
0
                PyErr_Format(PyExc_TypeError,
10021
0
                             "separator: expected str instance,"
10022
0
                             " %.80s found",
10023
0
                             Py_TYPE(separator)->tp_name);
10024
0
                goto onError;
10025
0
            }
10026
58.6M
            sep = separator;
10027
58.6M
            seplen = PyUnicode_GET_LENGTH(separator);
10028
58.6M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10029
            /* inc refcount to keep this code path symmetric with the
10030
               above case of a blank separator */
10031
58.6M
            Py_INCREF(sep);
10032
58.6M
        }
10033
58.6M
        last_obj = sep;
10034
58.6M
    }
10035
10036
    /* There are at least two things to join, or else we have a subclass
10037
     * of str in the sequence.
10038
     * Do a pre-pass to figure out the total amount of space we'll
10039
     * need (sz), and see whether all argument are strings.
10040
     */
10041
60.0M
    sz = 0;
10042
#ifdef Py_DEBUG
10043
    use_memcpy = 0;
10044
#else
10045
60.0M
    use_memcpy = 1;
10046
60.0M
#endif
10047
432M
    for (i = 0; i < seqlen; i++) {
10048
372M
        size_t add_sz;
10049
372M
        item = items[i];
10050
372M
        if (!PyUnicode_Check(item)) {
10051
0
            PyErr_Format(PyExc_TypeError,
10052
0
                         "sequence item %zd: expected str instance,"
10053
0
                         " %.80s found",
10054
0
                         i, Py_TYPE(item)->tp_name);
10055
0
            goto onError;
10056
0
        }
10057
372M
        add_sz = PyUnicode_GET_LENGTH(item);
10058
372M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10059
372M
        maxchar = Py_MAX(maxchar, item_maxchar);
10060
372M
        if (i != 0) {
10061
312M
            add_sz += seplen;
10062
312M
        }
10063
372M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10064
0
            PyErr_SetString(PyExc_OverflowError,
10065
0
                            "join() result is too long for a Python string");
10066
0
            goto onError;
10067
0
        }
10068
372M
        sz += add_sz;
10069
372M
        if (use_memcpy && last_obj != NULL) {
10070
299M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10071
6.63M
                use_memcpy = 0;
10072
299M
        }
10073
372M
        last_obj = item;
10074
372M
    }
10075
10076
60.0M
    res = PyUnicode_New(sz, maxchar);
10077
60.0M
    if (res == NULL)
10078
0
        goto onError;
10079
10080
    /* Catenate everything. */
10081
#ifdef Py_DEBUG
10082
    use_memcpy = 0;
10083
#else
10084
60.0M
    if (use_memcpy) {
10085
53.4M
        res_data = PyUnicode_1BYTE_DATA(res);
10086
53.4M
        kind = PyUnicode_KIND(res);
10087
53.4M
        if (seplen != 0)
10088
207k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10089
53.4M
    }
10090
60.0M
#endif
10091
60.0M
    if (use_memcpy) {
10092
324M
        for (i = 0; i < seqlen; ++i) {
10093
271M
            Py_ssize_t itemlen;
10094
271M
            item = items[i];
10095
10096
            /* Copy item, and maybe the separator. */
10097
271M
            if (i && seplen != 0) {
10098
766k
                memcpy(res_data,
10099
766k
                          sep_data,
10100
766k
                          kind * seplen);
10101
766k
                res_data += kind * seplen;
10102
766k
            }
10103
10104
271M
            itemlen = PyUnicode_GET_LENGTH(item);
10105
271M
            if (itemlen != 0) {
10106
234M
                memcpy(res_data,
10107
234M
                          PyUnicode_DATA(item),
10108
234M
                          kind * itemlen);
10109
234M
                res_data += kind * itemlen;
10110
234M
            }
10111
271M
        }
10112
53.4M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10113
53.4M
                           + kind * PyUnicode_GET_LENGTH(res));
10114
53.4M
    }
10115
6.63M
    else {
10116
108M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10117
101M
            Py_ssize_t itemlen;
10118
101M
            item = items[i];
10119
10120
            /* Copy item, and maybe the separator. */
10121
101M
            if (i && seplen != 0) {
10122
1.01M
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10123
1.01M
                res_offset += seplen;
10124
1.01M
            }
10125
10126
101M
            itemlen = PyUnicode_GET_LENGTH(item);
10127
101M
            if (itemlen != 0) {
10128
98.1M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10129
98.1M
                res_offset += itemlen;
10130
98.1M
            }
10131
101M
        }
10132
6.63M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10133
6.63M
    }
10134
10135
60.0M
    Py_XDECREF(sep);
10136
60.0M
    assert(_PyUnicode_CheckConsistency(res, 1));
10137
60.0M
    return res;
10138
10139
0
  onError:
10140
0
    Py_XDECREF(sep);
10141
0
    Py_XDECREF(res);
10142
0
    return NULL;
10143
60.0M
}
10144
10145
void
10146
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10147
                    Py_UCS4 fill_char)
10148
587
{
10149
587
    const int kind = PyUnicode_KIND(unicode);
10150
587
    void *data = PyUnicode_DATA(unicode);
10151
587
    assert(_PyUnicode_IsModifiable(unicode));
10152
587
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10153
587
    assert(start >= 0);
10154
587
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10155
587
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10156
587
}
10157
10158
Py_ssize_t
10159
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10160
               Py_UCS4 fill_char)
10161
587
{
10162
587
    Py_ssize_t maxlen;
10163
10164
587
    if (!PyUnicode_Check(unicode)) {
10165
0
        PyErr_BadInternalCall();
10166
0
        return -1;
10167
0
    }
10168
587
    if (unicode_check_modifiable(unicode))
10169
0
        return -1;
10170
10171
587
    if (start < 0) {
10172
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10173
0
        return -1;
10174
0
    }
10175
587
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10176
0
        PyErr_SetString(PyExc_ValueError,
10177
0
                         "fill character is bigger than "
10178
0
                         "the string maximum character");
10179
0
        return -1;
10180
0
    }
10181
10182
587
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10183
587
    length = Py_MIN(maxlen, length);
10184
587
    if (length <= 0)
10185
0
        return 0;
10186
10187
587
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10188
587
    return length;
10189
587
}
10190
10191
static PyObject *
10192
pad(PyObject *self,
10193
    Py_ssize_t left,
10194
    Py_ssize_t right,
10195
    Py_UCS4 fill)
10196
48
{
10197
48
    PyObject *u;
10198
48
    Py_UCS4 maxchar;
10199
48
    int kind;
10200
48
    void *data;
10201
10202
48
    if (left < 0)
10203
0
        left = 0;
10204
48
    if (right < 0)
10205
0
        right = 0;
10206
10207
48
    if (left == 0 && right == 0)
10208
0
        return unicode_result_unchanged(self);
10209
10210
48
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10211
48
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10212
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10213
0
        return NULL;
10214
0
    }
10215
48
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10216
48
    maxchar = Py_MAX(maxchar, fill);
10217
48
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10218
48
    if (!u)
10219
0
        return NULL;
10220
10221
48
    kind = PyUnicode_KIND(u);
10222
48
    data = PyUnicode_DATA(u);
10223
48
    if (left)
10224
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10225
48
    if (right)
10226
48
        _PyUnicode_Fill(kind, data, fill,
10227
48
                        left + _PyUnicode_LENGTH(self), right);
10228
48
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10229
48
    assert(_PyUnicode_CheckConsistency(u, 1));
10230
48
    return u;
10231
48
}
10232
10233
/* Split `string` into lines by dispatching to the stringlib splitlines
   routine matching its storage kind (with a separate routine for ASCII
   strings).  `keepends` is forwarded unchanged.  Returns NULL if
   ensure_unicode() rejects the argument; otherwise returns whatever the
   selected routine returns. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    PyObject *lines;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            lines = asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        else {
            lines = ucs1lib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), length, keepends);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        lines = ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), length, keepends);
        break;
    case PyUnicode_4BYTE_KIND:
        lines = ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), length, keepends);
        break;
    default:
        Py_UNREACHABLE();
    }
    return lines;
}
10267
10268
static PyObject *
10269
split(PyObject *self,
10270
      PyObject *substring,
10271
      Py_ssize_t maxcount)
10272
27.0M
{
10273
27.0M
    int kind1, kind2;
10274
27.0M
    const void *buf1, *buf2;
10275
27.0M
    Py_ssize_t len1, len2;
10276
27.0M
    PyObject* out;
10277
27.0M
    len1 = PyUnicode_GET_LENGTH(self);
10278
27.0M
    kind1 = PyUnicode_KIND(self);
10279
10280
27.0M
    if (substring == NULL) {
10281
168k
        if (maxcount < 0) {
10282
144k
            maxcount = (len1 - 1) / 2 + 1;
10283
144k
        }
10284
168k
        switch (kind1) {
10285
106k
        case PyUnicode_1BYTE_KIND:
10286
106k
            if (PyUnicode_IS_ASCII(self))
10287
79.6k
                return asciilib_split_whitespace(
10288
79.6k
                    self,  PyUnicode_1BYTE_DATA(self),
10289
79.6k
                    len1, maxcount
10290
79.6k
                    );
10291
27.3k
            else
10292
27.3k
                return ucs1lib_split_whitespace(
10293
27.3k
                    self,  PyUnicode_1BYTE_DATA(self),
10294
27.3k
                    len1, maxcount
10295
27.3k
                    );
10296
50.2k
        case PyUnicode_2BYTE_KIND:
10297
50.2k
            return ucs2lib_split_whitespace(
10298
50.2k
                self,  PyUnicode_2BYTE_DATA(self),
10299
50.2k
                len1, maxcount
10300
50.2k
                );
10301
11.5k
        case PyUnicode_4BYTE_KIND:
10302
11.5k
            return ucs4lib_split_whitespace(
10303
11.5k
                self,  PyUnicode_4BYTE_DATA(self),
10304
11.5k
                len1, maxcount
10305
11.5k
                );
10306
0
        default:
10307
0
            Py_UNREACHABLE();
10308
168k
        }
10309
168k
    }
10310
10311
26.8M
    kind2 = PyUnicode_KIND(substring);
10312
26.8M
    len2 = PyUnicode_GET_LENGTH(substring);
10313
26.8M
    if (maxcount < 0) {
10314
        // if len2 == 0, it will raise ValueError.
10315
20.7M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10316
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10317
20.7M
        maxcount = maxcount < 0 ? len1 : maxcount;
10318
20.7M
    }
10319
26.8M
    if (kind1 < kind2 || len1 < len2) {
10320
1.35M
        out = PyList_New(1);
10321
1.35M
        if (out == NULL)
10322
0
            return NULL;
10323
1.35M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10324
1.35M
        return out;
10325
1.35M
    }
10326
25.5M
    buf1 = PyUnicode_DATA(self);
10327
25.5M
    buf2 = PyUnicode_DATA(substring);
10328
25.5M
    if (kind2 != kind1) {
10329
239k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10330
239k
        if (!buf2)
10331
0
            return NULL;
10332
239k
    }
10333
10334
25.5M
    switch (kind1) {
10335
25.2M
    case PyUnicode_1BYTE_KIND:
10336
25.2M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10337
24.0M
            out = asciilib_split(
10338
24.0M
                self,  buf1, len1, buf2, len2, maxcount);
10339
1.26M
        else
10340
1.26M
            out = ucs1lib_split(
10341
1.26M
                self,  buf1, len1, buf2, len2, maxcount);
10342
25.2M
        break;
10343
202k
    case PyUnicode_2BYTE_KIND:
10344
202k
        out = ucs2lib_split(
10345
202k
            self,  buf1, len1, buf2, len2, maxcount);
10346
202k
        break;
10347
36.1k
    case PyUnicode_4BYTE_KIND:
10348
36.1k
        out = ucs4lib_split(
10349
36.1k
            self,  buf1, len1, buf2, len2, maxcount);
10350
36.1k
        break;
10351
0
    default:
10352
0
        out = NULL;
10353
25.5M
    }
10354
25.5M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10355
25.5M
    if (kind2 != kind1)
10356
239k
        PyMem_Free((void *)buf2);
10357
25.5M
    return out;
10358
25.5M
}
10359
10360
static PyObject *
10361
rsplit(PyObject *self,
10362
       PyObject *substring,
10363
       Py_ssize_t maxcount)
10364
78
{
10365
78
    int kind1, kind2;
10366
78
    const void *buf1, *buf2;
10367
78
    Py_ssize_t len1, len2;
10368
78
    PyObject* out;
10369
10370
78
    len1 = PyUnicode_GET_LENGTH(self);
10371
78
    kind1 = PyUnicode_KIND(self);
10372
10373
78
    if (substring == NULL) {
10374
0
        if (maxcount < 0) {
10375
0
            maxcount = (len1 - 1) / 2 + 1;
10376
0
        }
10377
0
        switch (kind1) {
10378
0
        case PyUnicode_1BYTE_KIND:
10379
0
            if (PyUnicode_IS_ASCII(self))
10380
0
                return asciilib_rsplit_whitespace(
10381
0
                    self,  PyUnicode_1BYTE_DATA(self),
10382
0
                    len1, maxcount
10383
0
                    );
10384
0
            else
10385
0
                return ucs1lib_rsplit_whitespace(
10386
0
                    self,  PyUnicode_1BYTE_DATA(self),
10387
0
                    len1, maxcount
10388
0
                    );
10389
0
        case PyUnicode_2BYTE_KIND:
10390
0
            return ucs2lib_rsplit_whitespace(
10391
0
                self,  PyUnicode_2BYTE_DATA(self),
10392
0
                len1, maxcount
10393
0
                );
10394
0
        case PyUnicode_4BYTE_KIND:
10395
0
            return ucs4lib_rsplit_whitespace(
10396
0
                self,  PyUnicode_4BYTE_DATA(self),
10397
0
                len1, maxcount
10398
0
                );
10399
0
        default:
10400
0
            Py_UNREACHABLE();
10401
0
        }
10402
0
    }
10403
78
    kind2 = PyUnicode_KIND(substring);
10404
78
    len2 = PyUnicode_GET_LENGTH(substring);
10405
78
    if (maxcount < 0) {
10406
        // if len2 == 0, it will raise ValueError.
10407
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10408
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10409
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10410
0
    }
10411
78
    if (kind1 < kind2 || len1 < len2) {
10412
0
        out = PyList_New(1);
10413
0
        if (out == NULL)
10414
0
            return NULL;
10415
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10416
0
        return out;
10417
0
    }
10418
78
    buf1 = PyUnicode_DATA(self);
10419
78
    buf2 = PyUnicode_DATA(substring);
10420
78
    if (kind2 != kind1) {
10421
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10422
0
        if (!buf2)
10423
0
            return NULL;
10424
0
    }
10425
10426
78
    switch (kind1) {
10427
78
    case PyUnicode_1BYTE_KIND:
10428
78
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10429
78
            out = asciilib_rsplit(
10430
78
                self,  buf1, len1, buf2, len2, maxcount);
10431
0
        else
10432
0
            out = ucs1lib_rsplit(
10433
0
                self,  buf1, len1, buf2, len2, maxcount);
10434
78
        break;
10435
0
    case PyUnicode_2BYTE_KIND:
10436
0
        out = ucs2lib_rsplit(
10437
0
            self,  buf1, len1, buf2, len2, maxcount);
10438
0
        break;
10439
0
    case PyUnicode_4BYTE_KIND:
10440
0
        out = ucs4lib_rsplit(
10441
0
            self,  buf1, len1, buf2, len2, maxcount);
10442
0
        break;
10443
0
    default:
10444
0
        out = NULL;
10445
78
    }
10446
78
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10447
78
    if (kind2 != kind1)
10448
0
        PyMem_Free((void *)buf2);
10449
78
    return out;
10450
78
}
10451
10452
static Py_ssize_t
10453
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10454
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10455
135M
{
10456
135M
    switch (kind) {
10457
27.9M
    case PyUnicode_1BYTE_KIND:
10458
27.9M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10459
23.6M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10460
4.35M
        else
10461
4.35M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10462
61.6M
    case PyUnicode_2BYTE_KIND:
10463
61.6M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10464
45.6M
    case PyUnicode_4BYTE_KIND:
10465
45.6M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10466
135M
    }
10467
135M
    Py_UNREACHABLE();
10468
135M
}
10469
10470
static Py_ssize_t
10471
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10472
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10473
43.1M
{
10474
43.1M
    switch (kind) {
10475
36.7M
    case PyUnicode_1BYTE_KIND:
10476
36.7M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10477
6.25M
    case PyUnicode_2BYTE_KIND:
10478
6.25M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10479
120k
    case PyUnicode_4BYTE_KIND:
10480
120k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10481
43.1M
    }
10482
43.1M
    Py_UNREACHABLE();
10483
43.1M
}
10484
10485
static void
10486
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10487
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10488
1.70M
{
10489
1.70M
    int kind = PyUnicode_KIND(u);
10490
1.70M
    void *data = PyUnicode_DATA(u);
10491
1.70M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10492
1.70M
    if (kind == PyUnicode_1BYTE_KIND) {
10493
661k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10494
661k
                                      (Py_UCS1 *)data + len,
10495
661k
                                      u1, u2, maxcount);
10496
661k
    }
10497
1.04M
    else if (kind == PyUnicode_2BYTE_KIND) {
10498
1.03M
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10499
1.03M
                                      (Py_UCS2 *)data + len,
10500
1.03M
                                      u1, u2, maxcount);
10501
1.03M
    }
10502
14.2k
    else {
10503
14.2k
        assert(kind == PyUnicode_4BYTE_KIND);
10504
14.2k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10505
14.2k
                                      (Py_UCS4 *)data + len,
10506
14.2k
                                      u1, u2, maxcount);
10507
14.2k
    }
10508
1.70M
}
10509
10510
static PyObject *
10511
replace(PyObject *self, PyObject *str1,
10512
        PyObject *str2, Py_ssize_t maxcount)
10513
75.6M
{
10514
75.6M
    PyObject *u;
10515
75.6M
    const char *sbuf = PyUnicode_DATA(self);
10516
75.6M
    const void *buf1 = PyUnicode_DATA(str1);
10517
75.6M
    const void *buf2 = PyUnicode_DATA(str2);
10518
75.6M
    int srelease = 0, release1 = 0, release2 = 0;
10519
75.6M
    int skind = PyUnicode_KIND(self);
10520
75.6M
    int kind1 = PyUnicode_KIND(str1);
10521
75.6M
    int kind2 = PyUnicode_KIND(str2);
10522
75.6M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10523
75.6M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10524
75.6M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10525
75.6M
    int mayshrink;
10526
75.6M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10527
10528
75.6M
    if (slen < len1)
10529
25.0M
        goto nothing;
10530
10531
50.6M
    if (maxcount < 0)
10532
50.6M
        maxcount = PY_SSIZE_T_MAX;
10533
0
    else if (maxcount == 0)
10534
0
        goto nothing;
10535
10536
50.6M
    if (str1 == str2)
10537
30.2k
        goto nothing;
10538
10539
50.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10540
50.5M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10541
50.5M
    if (maxchar < maxchar_str1)
10542
        /* substring too wide to be present */
10543
0
        goto nothing;
10544
50.5M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10545
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10546
       result string. */
10547
50.5M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10548
50.5M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10549
10550
50.5M
    if (len1 == len2) {
10551
        /* same length */
10552
7.44M
        if (len1 == 0)
10553
0
            goto nothing;
10554
7.44M
        if (len1 == 1) {
10555
            /* replace characters */
10556
7.43M
            Py_UCS4 u1, u2;
10557
7.43M
            Py_ssize_t pos;
10558
10559
7.43M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10560
7.43M
            pos = findchar(sbuf, skind, slen, u1, 1);
10561
7.43M
            if (pos < 0)
10562
5.72M
                goto nothing;
10563
1.70M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10564
1.70M
            u = PyUnicode_New(slen, maxchar);
10565
1.70M
            if (!u)
10566
0
                goto error;
10567
10568
1.70M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10569
1.70M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10570
1.70M
        }
10571
9.27k
        else {
10572
9.27k
            int rkind = skind;
10573
9.27k
            char *res;
10574
9.27k
            Py_ssize_t i;
10575
10576
9.27k
            if (kind1 < rkind) {
10577
                /* widen substring */
10578
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10579
0
                if (!buf1) goto error;
10580
0
                release1 = 1;
10581
0
            }
10582
9.27k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10583
9.27k
            if (i < 0)
10584
9.27k
                goto nothing;
10585
0
            if (rkind > kind2) {
10586
                /* widen replacement */
10587
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10588
0
                if (!buf2) goto error;
10589
0
                release2 = 1;
10590
0
            }
10591
0
            else if (rkind < kind2) {
10592
                /* widen self and buf1 */
10593
0
                rkind = kind2;
10594
0
                if (release1) {
10595
0
                    assert(buf1 != PyUnicode_DATA(str1));
10596
0
                    PyMem_Free((void *)buf1);
10597
0
                    buf1 = PyUnicode_DATA(str1);
10598
0
                    release1 = 0;
10599
0
                }
10600
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10601
0
                if (!sbuf) goto error;
10602
0
                srelease = 1;
10603
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10604
0
                if (!buf1) goto error;
10605
0
                release1 = 1;
10606
0
            }
10607
0
            u = PyUnicode_New(slen, maxchar);
10608
0
            if (!u)
10609
0
                goto error;
10610
0
            assert(PyUnicode_KIND(u) == rkind);
10611
0
            res = PyUnicode_DATA(u);
10612
10613
0
            memcpy(res, sbuf, rkind * slen);
10614
            /* change everything in-place, starting with this one */
10615
0
            memcpy(res + rkind * i,
10616
0
                   buf2,
10617
0
                   rkind * len2);
10618
0
            i += len1;
10619
10620
0
            while ( --maxcount > 0) {
10621
0
                i = anylib_find(rkind, self,
10622
0
                                sbuf+rkind*i, slen-i,
10623
0
                                str1, buf1, len1, i);
10624
0
                if (i == -1)
10625
0
                    break;
10626
0
                memcpy(res + rkind * i,
10627
0
                       buf2,
10628
0
                       rkind * len2);
10629
0
                i += len1;
10630
0
            }
10631
0
        }
10632
7.44M
    }
10633
43.1M
    else {
10634
43.1M
        Py_ssize_t n, i, j, ires;
10635
43.1M
        Py_ssize_t new_size;
10636
43.1M
        int rkind = skind;
10637
43.1M
        char *res;
10638
10639
43.1M
        if (kind1 < rkind) {
10640
            /* widen substring */
10641
6.37M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10642
6.37M
            if (!buf1) goto error;
10643
6.37M
            release1 = 1;
10644
6.37M
        }
10645
43.1M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10646
43.1M
        if (n == 0)
10647
37.7M
            goto nothing;
10648
5.38M
        if (kind2 < rkind) {
10649
            /* widen replacement */
10650
1.43M
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10651
1.43M
            if (!buf2) goto error;
10652
1.43M
            release2 = 1;
10653
1.43M
        }
10654
3.95M
        else if (kind2 > rkind) {
10655
            /* widen self and buf1 */
10656
0
            rkind = kind2;
10657
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10658
0
            if (!sbuf) goto error;
10659
0
            srelease = 1;
10660
0
            if (release1) {
10661
0
                assert(buf1 != PyUnicode_DATA(str1));
10662
0
                PyMem_Free((void *)buf1);
10663
0
                buf1 = PyUnicode_DATA(str1);
10664
0
                release1 = 0;
10665
0
            }
10666
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10667
0
            if (!buf1) goto error;
10668
0
            release1 = 1;
10669
0
        }
10670
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10671
           PyUnicode_GET_LENGTH(str1)); */
10672
5.38M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10673
0
                PyErr_SetString(PyExc_OverflowError,
10674
0
                                "replace string is too long");
10675
0
                goto error;
10676
0
        }
10677
5.38M
        new_size = slen + n * (len2 - len1);
10678
5.38M
        if (new_size == 0) {
10679
0
            u = _PyUnicode_GetEmpty();
10680
0
            goto done;
10681
0
        }
10682
5.38M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10683
0
            PyErr_SetString(PyExc_OverflowError,
10684
0
                            "replace string is too long");
10685
0
            goto error;
10686
0
        }
10687
5.38M
        u = PyUnicode_New(new_size, maxchar);
10688
5.38M
        if (!u)
10689
0
            goto error;
10690
5.38M
        assert(PyUnicode_KIND(u) == rkind);
10691
5.38M
        res = PyUnicode_DATA(u);
10692
5.38M
        ires = i = 0;
10693
5.38M
        if (len1 > 0) {
10694
140M
            while (n-- > 0) {
10695
                /* look for next match */
10696
135M
                j = anylib_find(rkind, self,
10697
135M
                                sbuf + rkind * i, slen-i,
10698
135M
                                str1, buf1, len1, i);
10699
135M
                if (j == -1)
10700
0
                    break;
10701
135M
                else if (j > i) {
10702
                    /* copy unchanged part [i:j] */
10703
23.5M
                    memcpy(res + rkind * ires,
10704
23.5M
                           sbuf + rkind * i,
10705
23.5M
                           rkind * (j-i));
10706
23.5M
                    ires += j - i;
10707
23.5M
                }
10708
                /* copy substitution string */
10709
135M
                if (len2 > 0) {
10710
135M
                    memcpy(res + rkind * ires,
10711
135M
                           buf2,
10712
135M
                           rkind * len2);
10713
135M
                    ires += len2;
10714
135M
                }
10715
135M
                i = j + len1;
10716
135M
            }
10717
5.38M
            if (i < slen)
10718
                /* copy tail [i:] */
10719
5.26M
                memcpy(res + rkind * ires,
10720
5.26M
                       sbuf + rkind * i,
10721
5.26M
                       rkind * (slen-i));
10722
5.38M
        }
10723
0
        else {
10724
            /* interleave */
10725
0
            while (n > 0) {
10726
0
                memcpy(res + rkind * ires,
10727
0
                       buf2,
10728
0
                       rkind * len2);
10729
0
                ires += len2;
10730
0
                if (--n <= 0)
10731
0
                    break;
10732
0
                memcpy(res + rkind * ires,
10733
0
                       sbuf + rkind * i,
10734
0
                       rkind);
10735
0
                ires++;
10736
0
                i++;
10737
0
            }
10738
0
            memcpy(res + rkind * ires,
10739
0
                   sbuf + rkind * i,
10740
0
                   rkind * (slen-i));
10741
0
        }
10742
5.38M
    }
10743
10744
7.09M
    if (mayshrink) {
10745
0
        unicode_adjust_maxchar(&u);
10746
0
        if (u == NULL)
10747
0
            goto error;
10748
0
    }
10749
10750
7.09M
  done:
10751
7.09M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10752
7.09M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10753
7.09M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10754
7.09M
    if (srelease)
10755
0
        PyMem_Free((void *)sbuf);
10756
7.09M
    if (release1)
10757
1.43M
        PyMem_Free((void *)buf1);
10758
7.09M
    if (release2)
10759
1.43M
        PyMem_Free((void *)buf2);
10760
7.09M
    assert(_PyUnicode_CheckConsistency(u, 1));
10761
7.09M
    return u;
10762
10763
68.5M
  nothing:
10764
    /* nothing to replace; return original string (when possible) */
10765
68.5M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10766
68.5M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10767
68.5M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10768
68.5M
    if (srelease)
10769
0
        PyMem_Free((void *)sbuf);
10770
68.5M
    if (release1)
10771
4.94M
        PyMem_Free((void *)buf1);
10772
68.5M
    if (release2)
10773
0
        PyMem_Free((void *)buf2);
10774
68.5M
    return unicode_result_unchanged(self);
10775
10776
0
  error:
10777
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10778
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10779
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10780
0
    if (srelease)
10781
0
        PyMem_Free((void *)sbuf);
10782
0
    if (release1)
10783
0
        PyMem_Free((void *)buf1);
10784
0
    if (release2)
10785
0
        PyMem_Free((void *)buf2);
10786
0
    return NULL;
10787
7.09M
}
10788
10789
/* --- Unicode Object Methods --------------------------------------------- */
10790
10791
/*[clinic input]
10792
@permit_long_docstring_body
10793
str.title as unicode_title
10794
10795
Return a version of the string where each word is titlecased.
10796
10797
More specifically, words start with uppercased characters and all remaining
10798
cased characters have lower case.
10799
[clinic start generated code]*/
10800
10801
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* str.title(): hand the work to case_operation() with the do_title
       callback and return whatever it produces. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10807
10808
/*[clinic input]
10809
@permit_long_docstring_body
10810
str.capitalize as unicode_capitalize
10811
10812
Return a capitalized version of the string.
10813
10814
More specifically, make the first character have upper case and the rest lower
10815
case.
10816
[clinic start generated code]*/
10817
10818
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* str.capitalize(): a non-empty string goes through case_operation()
       with the do_capitalize callback; an empty one is returned via
       unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10826
10827
/*[clinic input]
10828
str.casefold as unicode_casefold
10829
10830
Return a version of the string suitable for caseless comparisons.
10831
[clinic start generated code]*/
10832
10833
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): ASCII strings are handled by
       ascii_upper_or_lower(self, 1); everything else goes through
       case_operation() with the do_casefold callback. */
    return PyUnicode_IS_ASCII(self)
        ? ascii_upper_or_lower(self, 1)
        : case_operation(self, do_casefold);
}
10841
10842
10843
/* Argument converter. Accepts a single Unicode character. */
10844
10845
static int
10846
convert_uc(PyObject *obj, void *addr)
10847
122
{
10848
122
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10849
10850
122
    if (!PyUnicode_Check(obj)) {
10851
0
        PyErr_Format(PyExc_TypeError,
10852
0
                     "The fill character must be a unicode character, "
10853
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10854
0
        return 0;
10855
0
    }
10856
122
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10857
0
        PyErr_SetString(PyExc_TypeError,
10858
0
                        "The fill character must be exactly one character long");
10859
0
        return 0;
10860
0
    }
10861
122
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10862
122
    return 1;
10863
122
}
10864
10865
/*[clinic input]
10866
str.center as unicode_center
10867
10868
    width: Py_ssize_t
10869
    fillchar: Py_UCS4 = ' '
10870
    /
10871
10872
Return a centered string of length width.
10873
10874
Padding is done using the specified fill character (default is a space).
10875
[clinic start generated code]*/
10876
10877
static PyObject *
10878
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10879
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10880
0
{
10881
0
    Py_ssize_t marg, left;
10882
10883
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10884
0
        return unicode_result_unchanged(self);
10885
10886
0
    marg = width - PyUnicode_GET_LENGTH(self);
10887
0
    left = marg / 2 + (marg & width & 1);
10888
10889
0
    return pad(self, left, marg - left, fillchar);
10890
0
}
10891
10892
/* This function assumes that str1 and str2 are readied by the caller. */
10893
10894
static int
10895
unicode_compare(PyObject *str1, PyObject *str2)
10896
17.7M
{
10897
17.7M
#define COMPARE(TYPE1, TYPE2) \
10898
17.7M
    do { \
10899
15.2M
        TYPE1* p1 = (TYPE1 *)data1; \
10900
15.2M
        TYPE2* p2 = (TYPE2 *)data2; \
10901
15.2M
        TYPE1* end = p1 + len; \
10902
15.2M
        Py_UCS4 c1, c2; \
10903
15.2M
        for (; p1 != end; p1++, p2++) { \
10904
15.2M
            c1 = *p1; \
10905
15.2M
            c2 = *p2; \
10906
15.2M
            if (c1 != c2) \
10907
15.2M
                return (c1 < c2) ? -1 : 1; \
10908
15.2M
        } \
10909
15.2M
    } \
10910
15.2M
    while (0)
10911
10912
17.7M
    int kind1, kind2;
10913
17.7M
    const void *data1, *data2;
10914
17.7M
    Py_ssize_t len1, len2, len;
10915
10916
17.7M
    kind1 = PyUnicode_KIND(str1);
10917
17.7M
    kind2 = PyUnicode_KIND(str2);
10918
17.7M
    data1 = PyUnicode_DATA(str1);
10919
17.7M
    data2 = PyUnicode_DATA(str2);
10920
17.7M
    len1 = PyUnicode_GET_LENGTH(str1);
10921
17.7M
    len2 = PyUnicode_GET_LENGTH(str2);
10922
17.7M
    len = Py_MIN(len1, len2);
10923
10924
17.7M
    switch(kind1) {
10925
1.29M
    case PyUnicode_1BYTE_KIND:
10926
1.29M
    {
10927
1.29M
        switch(kind2) {
10928
295k
        case PyUnicode_1BYTE_KIND:
10929
295k
        {
10930
295k
            int cmp = memcmp(data1, data2, len);
10931
            /* normalize result of memcmp() into the range [-1; 1] */
10932
295k
            if (cmp < 0)
10933
261k
                return -1;
10934
33.5k
            if (cmp > 0)
10935
26.2k
                return 1;
10936
7.35k
            break;
10937
33.5k
        }
10938
862k
        case PyUnicode_2BYTE_KIND:
10939
862k
            COMPARE(Py_UCS1, Py_UCS2);
10940
0
            break;
10941
133k
        case PyUnicode_4BYTE_KIND:
10942
133k
            COMPARE(Py_UCS1, Py_UCS4);
10943
0
            break;
10944
0
        default:
10945
0
            Py_UNREACHABLE();
10946
1.29M
        }
10947
7.35k
        break;
10948
1.29M
    }
10949
13.7M
    case PyUnicode_2BYTE_KIND:
10950
13.7M
    {
10951
13.7M
        switch(kind2) {
10952
3.36k
        case PyUnicode_1BYTE_KIND:
10953
3.36k
            COMPARE(Py_UCS2, Py_UCS1);
10954
0
            break;
10955
12.5M
        case PyUnicode_2BYTE_KIND:
10956
12.5M
        {
10957
12.5M
            COMPARE(Py_UCS2, Py_UCS2);
10958
0
            break;
10959
12.5M
        }
10960
1.16M
        case PyUnicode_4BYTE_KIND:
10961
1.16M
            COMPARE(Py_UCS2, Py_UCS4);
10962
0
            break;
10963
0
        default:
10964
0
            Py_UNREACHABLE();
10965
13.7M
        }
10966
0
        break;
10967
13.7M
    }
10968
2.72M
    case PyUnicode_4BYTE_KIND:
10969
2.72M
    {
10970
2.72M
        switch(kind2) {
10971
1.12k
        case PyUnicode_1BYTE_KIND:
10972
1.12k
            COMPARE(Py_UCS4, Py_UCS1);
10973
0
            break;
10974
485k
        case PyUnicode_2BYTE_KIND:
10975
485k
            COMPARE(Py_UCS4, Py_UCS2);
10976
0
            break;
10977
2.23M
        case PyUnicode_4BYTE_KIND:
10978
2.23M
        {
10979
2.23M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10980
2.23M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10981
            /* normalize result of wmemcmp() into the range [-1; 1] */
10982
2.23M
            if (cmp < 0)
10983
1.09M
                return -1;
10984
1.13M
            if (cmp > 0)
10985
1.13M
                return 1;
10986
#else
10987
            COMPARE(Py_UCS4, Py_UCS4);
10988
#endif
10989
0
            break;
10990
1.13M
        }
10991
0
        default:
10992
0
            Py_UNREACHABLE();
10993
2.72M
        }
10994
0
        break;
10995
2.72M
    }
10996
0
    default:
10997
0
        Py_UNREACHABLE();
10998
17.7M
    }
10999
11000
7.35k
    if (len1 == len2)
11001
7.32k
        return 0;
11002
36
    if (len1 < len2)
11003
15
        return -1;
11004
21
    else
11005
21
        return 1;
11006
11007
36
#undef COMPARE
11008
36
}
11009
11010
11011
int
11012
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11013
642M
{
11014
642M
    assert(PyUnicode_Check(str1));
11015
642M
    assert(PyUnicode_Check(str2));
11016
642M
    if (str1 == str2) {
11017
113M
        return 1;
11018
113M
    }
11019
528M
    return unicode_eq(str1, str2);
11020
642M
}
11021
11022
11023
int
11024
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11025
0
{
11026
0
    if (!PyUnicode_Check(str1)) {
11027
0
        PyErr_Format(PyExc_TypeError,
11028
0
                     "first argument must be str, not %T", str1);
11029
0
        return -1;
11030
0
    }
11031
0
    if (!PyUnicode_Check(str2)) {
11032
0
        PyErr_Format(PyExc_TypeError,
11033
0
                     "second argument must be str, not %T", str2);
11034
0
        return -1;
11035
0
    }
11036
11037
0
    return _PyUnicode_Equal(str1, str2);
11038
0
}
11039
11040
11041
int
11042
PyUnicode_Compare(PyObject *left, PyObject *right)
11043
225k
{
11044
225k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11045
        /* a string is equal to itself */
11046
225k
        if (left == right)
11047
0
            return 0;
11048
11049
225k
        return unicode_compare(left, right);
11050
225k
    }
11051
0
    PyErr_Format(PyExc_TypeError,
11052
0
                 "Can't compare %.100s and %.100s",
11053
0
                 Py_TYPE(left)->tp_name,
11054
0
                 Py_TYPE(right)->tp_name);
11055
0
    return -1;
11056
225k
}
11057
11058
int
11059
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11060
3.93M
{
11061
3.93M
    Py_ssize_t i;
11062
3.93M
    int kind;
11063
3.93M
    Py_UCS4 chr;
11064
11065
3.93M
    assert(_PyUnicode_CHECK(uni));
11066
3.93M
    kind = PyUnicode_KIND(uni);
11067
3.93M
    if (kind == PyUnicode_1BYTE_KIND) {
11068
3.92M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11069
3.92M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11070
3.92M
        size_t len, len2 = strlen(str);
11071
3.92M
        int cmp;
11072
11073
3.92M
        len = Py_MIN(len1, len2);
11074
3.92M
        cmp = memcmp(data, str, len);
11075
3.92M
        if (cmp != 0) {
11076
3.45M
            if (cmp < 0)
11077
58.0k
                return -1;
11078
3.39M
            else
11079
3.39M
                return 1;
11080
3.45M
        }
11081
477k
        if (len1 > len2)
11082
305
            return 1; /* uni is longer */
11083
477k
        if (len1 < len2)
11084
707
            return -1; /* str is longer */
11085
476k
        return 0;
11086
477k
    }
11087
1.24k
    else {
11088
1.24k
        const void *data = PyUnicode_DATA(uni);
11089
        /* Compare Unicode string and source character set string */
11090
2.33k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11091
2.12k
            if (chr != (unsigned char)str[i])
11092
1.04k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11093
        /* This check keeps Python strings that end in '\0' from comparing equal
11094
         to C strings identical up to that point. */
11095
204
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11096
204
            return 1; /* uni is longer */
11097
0
        if (str[i])
11098
0
            return -1; /* str is longer */
11099
0
        return 0;
11100
0
    }
11101
3.93M
}
11102
11103
int
11104
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11105
28
{
11106
28
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11107
28
}
11108
11109
int
11110
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11111
28
{
11112
28
    assert(_PyUnicode_CHECK(unicode));
11113
28
    assert(str);
11114
11115
28
    if (PyUnicode_IS_ASCII(unicode)) {
11116
28
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11117
28
        return size == len &&
11118
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11119
28
    }
11120
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11121
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11122
0
        return size == len &&
11123
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11124
0
    }
11125
11126
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11127
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11128
0
        return 0;
11129
0
    }
11130
0
    const unsigned char *s = (const unsigned char *)str;
11131
0
    const unsigned char *ends = s + (size_t)size;
11132
0
    int kind = PyUnicode_KIND(unicode);
11133
0
    const void *data = PyUnicode_DATA(unicode);
11134
    /* Compare Unicode string and UTF-8 string */
11135
0
    for (Py_ssize_t i = 0; i < len; i++) {
11136
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11137
0
        if (ch < 0x80) {
11138
0
            if (ends == s || s[0] != ch) {
11139
0
                return 0;
11140
0
            }
11141
0
            s += 1;
11142
0
        }
11143
0
        else if (ch < 0x800) {
11144
0
            if ((ends - s) < 2 ||
11145
0
                s[0] != (0xc0 | (ch >> 6)) ||
11146
0
                s[1] != (0x80 | (ch & 0x3f)))
11147
0
            {
11148
0
                return 0;
11149
0
            }
11150
0
            s += 2;
11151
0
        }
11152
0
        else if (ch < 0x10000) {
11153
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11154
0
                (ends - s) < 3 ||
11155
0
                s[0] != (0xe0 | (ch >> 12)) ||
11156
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11157
0
                s[2] != (0x80 | (ch & 0x3f)))
11158
0
            {
11159
0
                return 0;
11160
0
            }
11161
0
            s += 3;
11162
0
        }
11163
0
        else {
11164
0
            assert(ch <= MAX_UNICODE);
11165
0
            if ((ends - s) < 4 ||
11166
0
                s[0] != (0xf0 | (ch >> 18)) ||
11167
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11168
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11169
0
                s[3] != (0x80 | (ch & 0x3f)))
11170
0
            {
11171
0
                return 0;
11172
0
            }
11173
0
            s += 4;
11174
0
        }
11175
0
    }
11176
0
    return s == ends;
11177
0
}
11178
11179
int
11180
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11181
42.1M
{
11182
42.1M
    size_t len;
11183
42.1M
    assert(_PyUnicode_CHECK(unicode));
11184
42.1M
    assert(str);
11185
#ifndef NDEBUG
11186
    for (const char *p = str; *p; p++) {
11187
        assert((unsigned char)*p < 128);
11188
    }
11189
#endif
11190
42.1M
    if (!PyUnicode_IS_ASCII(unicode))
11191
122k
        return 0;
11192
42.0M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11193
42.0M
    return strlen(str) == len &&
11194
627k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11195
42.1M
}
11196
11197
/* Rich comparison of two objects as strings.

   Returns NotImplemented unless both operands pass PyUnicode_Check().
   Otherwise returns True/False for the comparison selected by `op`
   (Py_EQ, Py_NE, Py_LT, Py_LE, Py_GT, Py_GE).  When both operands are the
   same object and `op` is not one of those six values, PyErr_BadArgument()
   is called and NULL is returned. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!(PyUnicode_Check(left) && PyUnicode_Check(right))) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* Identity shortcut: a string is equal to itself, so the answer
           depends on the operator alone. */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op == Py_EQ || op == Py_NE) {
        /* Equality needs no ordering: take the unicode_eq() result and
           flip it for != . */
        int outcome = unicode_eq(left, right);
        outcome ^= (op == Py_NE);
        return PyBool_FromLong(outcome);
    }

    /* Ordering operators: compare the three-way result against zero. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11231
11232
int
11233
PyUnicode_Contains(PyObject *str, PyObject *substr)
11234
216M
{
11235
216M
    int kind1, kind2;
11236
216M
    const void *buf1, *buf2;
11237
216M
    Py_ssize_t len1, len2;
11238
216M
    int result;
11239
11240
216M
    if (!PyUnicode_Check(substr)) {
11241
0
        PyErr_Format(PyExc_TypeError,
11242
0
                     "'in <string>' requires string as left operand, not %.100s",
11243
0
                     Py_TYPE(substr)->tp_name);
11244
0
        return -1;
11245
0
    }
11246
216M
    if (ensure_unicode(str) < 0)
11247
0
        return -1;
11248
11249
216M
    kind1 = PyUnicode_KIND(str);
11250
216M
    kind2 = PyUnicode_KIND(substr);
11251
216M
    if (kind1 < kind2)
11252
14.2M
        return 0;
11253
201M
    len1 = PyUnicode_GET_LENGTH(str);
11254
201M
    len2 = PyUnicode_GET_LENGTH(substr);
11255
201M
    if (len1 < len2)
11256
1.02M
        return 0;
11257
200M
    buf1 = PyUnicode_DATA(str);
11258
200M
    buf2 = PyUnicode_DATA(substr);
11259
200M
    if (len2 == 1) {
11260
186M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11261
186M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11262
186M
        return result;
11263
186M
    }
11264
14.2M
    if (kind2 != kind1) {
11265
18.5k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11266
18.5k
        if (!buf2)
11267
0
            return -1;
11268
18.5k
    }
11269
11270
14.2M
    switch (kind1) {
11271
14.2M
    case PyUnicode_1BYTE_KIND:
11272
14.2M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11273
14.2M
        break;
11274
14.1k
    case PyUnicode_2BYTE_KIND:
11275
14.1k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11276
14.1k
        break;
11277
4.34k
    case PyUnicode_4BYTE_KIND:
11278
4.34k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11279
4.34k
        break;
11280
0
    default:
11281
0
        Py_UNREACHABLE();
11282
14.2M
    }
11283
11284
14.2M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11285
14.2M
    if (kind2 != kind1)
11286
18.5k
        PyMem_Free((void *)buf2);
11287
11288
14.2M
    return result;
11289
14.2M
}
11290
11291
/* Concat to string or Unicode object giving a new Unicode object. */
11292
11293
PyObject *
11294
PyUnicode_Concat(PyObject *left, PyObject *right)
11295
51.7M
{
11296
51.7M
    PyObject *result;
11297
51.7M
    Py_UCS4 maxchar, maxchar2;
11298
51.7M
    Py_ssize_t left_len, right_len, new_len;
11299
11300
51.7M
    if (ensure_unicode(left) < 0)
11301
0
        return NULL;
11302
11303
51.7M
    if (!PyUnicode_Check(right)) {
11304
0
        PyErr_Format(PyExc_TypeError,
11305
0
            "can only concatenate str (not \"%.200s\") to str",
11306
0
            Py_TYPE(right)->tp_name);
11307
0
        return NULL;
11308
0
    }
11309
11310
    /* Shortcuts */
11311
51.7M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11312
51.7M
    if (left == empty) {
11313
105k
        return PyUnicode_FromObject(right);
11314
105k
    }
11315
51.5M
    if (right == empty) {
11316
4.55M
        return PyUnicode_FromObject(left);
11317
4.55M
    }
11318
11319
47.0M
    left_len = PyUnicode_GET_LENGTH(left);
11320
47.0M
    right_len = PyUnicode_GET_LENGTH(right);
11321
47.0M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11322
0
        PyErr_SetString(PyExc_OverflowError,
11323
0
                        "strings are too large to concat");
11324
0
        return NULL;
11325
0
    }
11326
47.0M
    new_len = left_len + right_len;
11327
11328
47.0M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11329
47.0M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11330
47.0M
    maxchar = Py_MAX(maxchar, maxchar2);
11331
11332
    /* Concat the two Unicode strings */
11333
47.0M
    result = PyUnicode_New(new_len, maxchar);
11334
47.0M
    if (result == NULL)
11335
0
        return NULL;
11336
47.0M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11337
47.0M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11338
47.0M
    assert(_PyUnicode_CheckConsistency(result, 1));
11339
47.0M
    return result;
11340
47.0M
}
11341
11342
void
11343
PyUnicode_Append(PyObject **p_left, PyObject *right)
11344
5.43M
{
11345
5.43M
    PyObject *left, *res;
11346
5.43M
    Py_UCS4 maxchar, maxchar2;
11347
5.43M
    Py_ssize_t left_len, right_len, new_len;
11348
11349
5.43M
    if (p_left == NULL) {
11350
0
        if (!PyErr_Occurred())
11351
0
            PyErr_BadInternalCall();
11352
0
        return;
11353
0
    }
11354
5.43M
    left = *p_left;
11355
5.43M
    if (right == NULL || left == NULL
11356
5.43M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11357
0
        if (!PyErr_Occurred())
11358
0
            PyErr_BadInternalCall();
11359
0
        goto error;
11360
0
    }
11361
11362
    /* Shortcuts */
11363
5.43M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11364
5.43M
    if (left == empty) {
11365
513k
        Py_DECREF(left);
11366
513k
        *p_left = Py_NewRef(right);
11367
513k
        return;
11368
513k
    }
11369
4.92M
    if (right == empty) {
11370
15.3k
        return;
11371
15.3k
    }
11372
11373
4.90M
    left_len = PyUnicode_GET_LENGTH(left);
11374
4.90M
    right_len = PyUnicode_GET_LENGTH(right);
11375
4.90M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11376
0
        PyErr_SetString(PyExc_OverflowError,
11377
0
                        "strings are too large to concat");
11378
0
        goto error;
11379
0
    }
11380
4.90M
    new_len = left_len + right_len;
11381
11382
4.90M
    if (_PyUnicode_IsModifiable(left)
11383
4.90M
        && PyUnicode_CheckExact(right)
11384
4.90M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11385
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11386
           to change the structure size, but characters are stored just after
11387
           the structure, and so it requires to move all characters which is
11388
           not so different than duplicating the string. */
11389
1.95M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11390
1.95M
    {
11391
        /* append inplace */
11392
1.95M
        if (unicode_resize(p_left, new_len) != 0)
11393
0
            goto error;
11394
11395
        /* copy 'right' into the newly allocated area of 'left' */
11396
1.95M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11397
1.95M
    }
11398
2.94M
    else {
11399
2.94M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11400
2.94M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11401
2.94M
        maxchar = Py_MAX(maxchar, maxchar2);
11402
11403
        /* Concat the two Unicode strings */
11404
2.94M
        res = PyUnicode_New(new_len, maxchar);
11405
2.94M
        if (res == NULL)
11406
0
            goto error;
11407
2.94M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11408
2.94M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11409
2.94M
        Py_DECREF(left);
11410
2.94M
        *p_left = res;
11411
2.94M
    }
11412
4.90M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11413
4.90M
    return;
11414
11415
0
error:
11416
0
    Py_CLEAR(*p_left);
11417
0
}
11418
11419
void
11420
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11421
8
{
11422
8
    PyUnicode_Append(pleft, right);
11423
8
    Py_XDECREF(right);
11424
8
}
11425
11426
/*[clinic input]
11427
@permit_long_summary
11428
@text_signature "($self, sub[, start[, end]], /)"
11429
str.count as unicode_count -> Py_ssize_t
11430
11431
    self as str: self
11432
    sub as substr: unicode
11433
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11434
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11435
    /
11436
11437
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11438
11439
Optional arguments start and end are interpreted as in slice notation.
11440
[clinic start generated code]*/
11441
11442
static Py_ssize_t
11443
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11444
                   Py_ssize_t end)
11445
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11446
27.7M
{
11447
27.7M
    assert(PyUnicode_Check(str));
11448
27.7M
    assert(PyUnicode_Check(substr));
11449
11450
27.7M
    Py_ssize_t result;
11451
27.7M
    int kind1, kind2;
11452
27.7M
    const void *buf1 = NULL, *buf2 = NULL;
11453
27.7M
    Py_ssize_t len1, len2;
11454
11455
27.7M
    kind1 = PyUnicode_KIND(str);
11456
27.7M
    kind2 = PyUnicode_KIND(substr);
11457
27.7M
    if (kind1 < kind2)
11458
0
        return 0;
11459
11460
27.7M
    len1 = PyUnicode_GET_LENGTH(str);
11461
27.7M
    len2 = PyUnicode_GET_LENGTH(substr);
11462
27.7M
    ADJUST_INDICES(start, end, len1);
11463
27.7M
    if (end - start < len2)
11464
3.75M
        return 0;
11465
11466
23.9M
    buf1 = PyUnicode_DATA(str);
11467
23.9M
    buf2 = PyUnicode_DATA(substr);
11468
23.9M
    if (kind2 != kind1) {
11469
4.78M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11470
4.78M
        if (!buf2)
11471
0
            goto onError;
11472
4.78M
    }
11473
11474
    // We don't reuse `anylib_count` here because of the explicit casts.
11475
23.9M
    switch (kind1) {
11476
19.2M
    case PyUnicode_1BYTE_KIND:
11477
19.2M
        result = ucs1lib_count(
11478
19.2M
            ((const Py_UCS1*)buf1) + start, end - start,
11479
19.2M
            buf2, len2, PY_SSIZE_T_MAX
11480
19.2M
            );
11481
19.2M
        break;
11482
3.08M
    case PyUnicode_2BYTE_KIND:
11483
3.08M
        result = ucs2lib_count(
11484
3.08M
            ((const Py_UCS2*)buf1) + start, end - start,
11485
3.08M
            buf2, len2, PY_SSIZE_T_MAX
11486
3.08M
            );
11487
3.08M
        break;
11488
1.70M
    case PyUnicode_4BYTE_KIND:
11489
1.70M
        result = ucs4lib_count(
11490
1.70M
            ((const Py_UCS4*)buf1) + start, end - start,
11491
1.70M
            buf2, len2, PY_SSIZE_T_MAX
11492
1.70M
            );
11493
1.70M
        break;
11494
0
    default:
11495
0
        Py_UNREACHABLE();
11496
23.9M
    }
11497
11498
23.9M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11499
23.9M
    if (kind2 != kind1)
11500
4.78M
        PyMem_Free((void *)buf2);
11501
11502
23.9M
    return result;
11503
0
  onError:
11504
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11505
0
    if (kind2 != kind1)
11506
0
        PyMem_Free((void *)buf2);
11507
0
    return -1;
11508
23.9M
}
11509
11510
/*[clinic input]
11511
str.encode as unicode_encode
11512
11513
    encoding: str(c_default="NULL") = 'utf-8'
11514
        The encoding in which to encode the string.
11515
    errors: str(c_default="NULL") = 'strict'
11516
        The error handling scheme to use for encoding errors.
11517
        The default is 'strict' meaning that encoding errors raise a
11518
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11519
        'xmlcharrefreplace' as well as any other name registered with
11520
        codecs.register_error that can handle UnicodeEncodeErrors.
11521
11522
Encode the string using the codec registered for encoding.
11523
[clinic start generated code]*/
11524
11525
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode() is a thin wrapper: all arguments are forwarded
       unchanged and the result (or NULL on error) is passed straight
       back to the caller. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11531
11532
/*[clinic input]
11533
str.expandtabs as unicode_expandtabs
11534
11535
    tabsize: int = 8
11536
11537
Return a copy where all tab characters are expanded using spaces.
11538
11539
If tabsize is not given, a tab size of 8 characters is assumed.
11540
[clinic start generated code]*/
11541
11542
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over the source: the first computes the expanded length
       (detecting overflow and whether any tab exists at all), the second
       writes the expanded text.  A tab advances to the next multiple of
       `tabsize` columns; with tabsize <= 0 tabs are dropped.  The column
       counter restarts after '\n' or '\r'. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src = PyUnicode_DATA(self);
    PyObject *expanded;
    void *dest;
    Py_ssize_t pos, pad;
    Py_UCS4 ch;

    /* First pass: determine size of output string */
    Py_ssize_t out_len = 0;
    Py_ssize_t column = 0;
    int saw_tab = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            column++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        saw_tab = 1;
        if (tabsize > 0) {
            pad = tabsize - (column % tabsize); /* cannot overflow */
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!saw_tab) {
        /* Nothing to expand. */
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest = PyUnicode_DATA(expanded);

    out_len = 0;
    column = 0;
    for (pos = 0; pos < src_len; pos++) {
        ch = PyUnicode_READ(kind, src, pos);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest, out_len, ch);
            out_len++;
            column++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            continue;
        }

        if (tabsize > 0) {
            pad = tabsize - (column % tabsize);
            _PyUnicode_Fill(kind, dest, ' ', out_len, pad);
            column += pad;
            out_len += pad;
        }
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11617
11618
/*[clinic input]
11619
@permit_long_summary
11620
str.find as unicode_find = str.count
11621
11622
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11623
11624
Optional arguments start and end are interpreted as in slice notation.
11625
Return -1 on failure.
11626
[clinic start generated code]*/
11627
11628
static Py_ssize_t
11629
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11630
                  Py_ssize_t end)
11631
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11632
23.7M
{
11633
23.7M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11634
23.7M
    if (result < 0) {
11635
5.74M
        return -1;
11636
5.74M
    }
11637
17.9M
    return result;
11638
23.7M
}
11639
11640
static PyObject *
11641
unicode_getitem(PyObject *self, Py_ssize_t index)
11642
66.4M
{
11643
66.4M
    const void *data;
11644
66.4M
    int kind;
11645
66.4M
    Py_UCS4 ch;
11646
11647
66.4M
    if (!PyUnicode_Check(self)) {
11648
0
        PyErr_BadArgument();
11649
0
        return NULL;
11650
0
    }
11651
66.4M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11652
14.7k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11653
14.7k
        return NULL;
11654
14.7k
    }
11655
66.4M
    kind = PyUnicode_KIND(self);
11656
66.4M
    data = PyUnicode_DATA(self);
11657
66.4M
    ch = PyUnicode_READ(kind, data, index);
11658
66.4M
    return unicode_char(ch);
11659
66.4M
}
11660
11661
/* Believe it or not, this produces the same value for ASCII strings
11662
   as bytes_hash(). */
11663
static Py_hash_t
11664
unicode_hash(PyObject *self)
11665
1.18G
{
11666
1.18G
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11667
11668
#ifdef Py_DEBUG
11669
    assert(_Py_HashSecret_Initialized);
11670
#endif
11671
1.18G
    Py_hash_t hash = PyUnicode_HASH(self);
11672
1.18G
    if (hash != -1) {
11673
1.11G
        return hash;
11674
1.11G
    }
11675
69.4M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11676
69.4M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11677
11678
69.4M
    PyUnicode_SET_HASH(self, x);
11679
69.4M
    return x;
11680
1.18G
}
11681
11682
/*[clinic input]
11683
@permit_long_summary
11684
str.index as unicode_index = str.count
11685
11686
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11687
11688
Optional arguments start and end are interpreted as in slice notation.
11689
Raises ValueError when the substring is not found.
11690
[clinic start generated code]*/
11691
11692
static Py_ssize_t
11693
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11694
                   Py_ssize_t end)
11695
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11696
47.8k
{
11697
47.8k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11698
47.8k
    if (result == -1) {
11699
733
        PyErr_SetString(PyExc_ValueError, "substring not found");
11700
733
    }
11701
47.1k
    else if (result < 0) {
11702
0
        return -1;
11703
0
    }
11704
47.8k
    return result;
11705
47.8k
}
11706
11707
/*[clinic input]
11708
str.isascii as unicode_isascii
11709
11710
Return True if all characters in the string are ASCII, False otherwise.
11711
11712
ASCII characters have code points in the range U+0000-U+007F.
11713
Empty string is ASCII too.
11714
[clinic start generated code]*/
11715
11716
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* No scan of the characters is done here: the answer is whatever
       PyUnicode_IS_ASCII() reports for the object. */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
11722
11723
/*[clinic input]
11724
@permit_long_docstring_body
11725
str.islower as unicode_islower
11726
11727
Return True if the string is a lowercase string, False otherwise.
11728
11729
A string is lowercase if all cased characters in the string are lowercase and
11730
there is at least one cased character in the string.
11731
[clinic start generated code]*/
11732
11733
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* True iff no character is uppercase or titlecase and at least one
       character is lowercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once one lowercase character was found there is no need to
           classify further characters as lowercase. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11766
11767
/*[clinic input]
11768
@permit_long_docstring_body
11769
str.isupper as unicode_isupper
11770
11771
Return True if the string is an uppercase string, False otherwise.
11772
11773
A string is uppercase if all cased characters in the string are uppercase and
11774
there is at least one cased character in the string.
11775
[clinic start generated code]*/
11776
11777
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* True iff no character is lowercase or titlecase and at least one
       character is uppercase. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once one uppercase character was found there is no need to
           classify further characters as uppercase. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11810
11811
/*[clinic input]
11812
str.istitle as unicode_istitle
11813
11814
Return True if the string is a title-cased string, False otherwise.
11815
11816
In a title-cased string, upper- and title-case characters may only
11817
follow uncased characters and lowercase characters only cased ones.
11818
[clinic start generated code]*/
11819
11820
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Walks the string tracking whether the previous character was cased.
       Uppercase/titlecase characters may only follow an uncased character
       (or start the string); lowercase characters may only follow a cased
       one.  The result is True iff no rule is broken and at least one
       cased character exists. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;
    int after_cased = 0;   /* was the previous character cased? */
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        const int is_leading =
            Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch);
        const int is_cased = is_leading || Py_UNICODE_ISLOWER(ch);

        if (!is_cased) {
            after_cased = 0;
            continue;
        }

        /* Violation when a leading (upper/title) character follows a cased
           one, or a lowercase character follows an uncased one; i.e. when
           both flags are equal. */
        if (is_leading == after_cased) {
            Py_RETURN_FALSE;
        }
        after_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
11866
11867
/*[clinic input]
11868
@permit_long_docstring_body
11869
str.isspace as unicode_isspace
11870
11871
Return True if the string is a whitespace string, False otherwise.
11872
11873
A string is whitespace if all characters in the string are whitespace and there
11874
is at least one character in the string.
11875
[clinic start generated code]*/
11876
11877
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISSPACE(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISSPACE(only));
    }

    /* Stop at the first character that is not whitespace. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11905
11906
/*[clinic input]
11907
@permit_long_docstring_body
11908
str.isalpha as unicode_isalpha
11909
11910
Return True if the string is an alphabetic string, False otherwise.
11911
11912
A string is alphabetic if all characters in the string are alphabetic and there
11913
is at least one character in the string.
11914
[clinic start generated code]*/
11915
11916
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISALPHA(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALPHA(only));
    }

    /* Stop at the first character that is not alphabetic. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALPHA(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11943
11944
/*[clinic input]
11945
@permit_long_docstring_body
11946
str.isalnum as unicode_isalnum
11947
11948
Return True if the string is an alpha-numeric string, False otherwise.
11949
11950
A string is alpha-numeric if all characters in the string are alpha-numeric and
11951
there is at least one character in the string.
11952
[clinic start generated code]*/
11953
11954
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISALNUM(). */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* Stop at the first character that is not alpha-numeric. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11983
11984
/*[clinic input]
11985
@permit_long_docstring_body
11986
str.isdecimal as unicode_isdecimal
11987
11988
Return True if the string is a decimal string, False otherwise.
11989
11990
A string is a decimal string if all characters in the string are decimal and
11991
there is at least one character in the string.
11992
[clinic start generated code]*/
11993
11994
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISDECIMAL(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDECIMAL(only));
    }

    /* Stop at the first character that is not a decimal. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDECIMAL(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12021
12022
/*[clinic input]
12023
@permit_long_docstring_body
12024
str.isdigit as unicode_isdigit
12025
12026
Return True if the string is a digit string, False otherwise.
12027
12028
A string is a digit string if all characters in the string are digits and there
12029
is at least one character in the string.
12030
[clinic start generated code]*/
12031
12032
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* True iff the string is non-empty and every character satisfies
       Py_UNICODE_ISDIGIT(). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Stop at the first character that is not a digit. */
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12060
12061
/*[clinic input]
12062
@permit_long_docstring_body
12063
str.isnumeric as unicode_isnumeric
12064
12065
Return True if the string is a numeric string, False otherwise.
12066
12067
A string is numeric if all characters in the string are numeric and there is at
12068
least one character in the string.
12069
[clinic start generated code]*/
12070
12071
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* Implements str.isnumeric(): true only for a non-empty string in which
       every character satisfies Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* An empty string is never numeric. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Bail out on the first non-numeric character.  A string of length
       one simply makes a single trip through the loop. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12098
12099
Py_ssize_t
12100
_PyUnicode_ScanIdentifier(PyObject *self)
12101
74.6k
{
12102
74.6k
    Py_ssize_t i;
12103
74.6k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12104
74.6k
    if (len == 0) {
12105
        /* an empty string is not a valid identifier */
12106
0
        return 0;
12107
0
    }
12108
12109
74.6k
    int kind = PyUnicode_KIND(self);
12110
74.6k
    const void *data = PyUnicode_DATA(self);
12111
74.6k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12112
    /* PEP 3131 says that the first character must be in
12113
       XID_Start and subsequent characters in XID_Continue,
12114
       and for the ASCII range, the 2.x rules apply (i.e
12115
       start with letters and underscore, continue with
12116
       letters, digits, underscore). However, given the current
12117
       definition of XID_Start and XID_Continue, it is sufficient
12118
       to check just for these, except that _ must be allowed
12119
       as starting an identifier.  */
12120
74.6k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12121
828
        return 0;
12122
828
    }
12123
12124
610k
    for (i = 1; i < len; i++) {
12125
536k
        ch = PyUnicode_READ(kind, data, i);
12126
536k
        if (!_PyUnicode_IsXidContinue(ch)) {
12127
338
            return i;
12128
338
        }
12129
536k
    }
12130
73.4k
    return i;
12131
73.8k
}
12132
12133
int
12134
PyUnicode_IsIdentifier(PyObject *self)
12135
63.2k
{
12136
63.2k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12137
63.2k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12138
    /* an empty string is not a valid identifier */
12139
63.2k
    return len && i == len;
12140
63.2k
}
12141
12142
/*[clinic input]
12143
@permit_long_docstring_body
12144
str.isidentifier as unicode_isidentifier
12145
12146
Return True if the string is a valid Python identifier, False otherwise.
12147
12148
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12149
such as "def" or "class".
12150
[clinic start generated code]*/
12151
12152
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* Implements str.isidentifier() by wrapping the C-level predicate in a
       Python bool. */
    const int is_identifier = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_identifier);
}
12158
12159
/*[clinic input]
12160
@permit_long_summary
12161
str.isprintable as unicode_isprintable
12162
12163
Return True if all characters in the string are printable, False otherwise.
12164
12165
A character is printable if repr() may use it in its output.
12166
[clinic start generated code]*/
12167
12168
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* Implements str.isprintable(): true when every character satisfies
       Py_UNICODE_ISPRINTABLE.  Unlike the other is*() predicates there is
       no special case for the empty string, which therefore yields True. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Bail out on the first non-printable character.  A string of length
       one simply makes a single trip through the loop. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12192
12193
/*[clinic input]
12194
@permit_long_docstring_body
12195
str.join as unicode_join
12196
12197
    iterable: object
12198
    /
12199
12200
Concatenate any number of strings.
12201
12202
The string whose method is called is inserted in between each given string.
12203
The result is returned as a new string.
12204
12205
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12206
[clinic start generated code]*/
12207
12208
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* Implements str.join(): self is the separator; all of the work is
       delegated to the public PyUnicode_Join(). */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12214
12215
static Py_ssize_t
12216
unicode_length(PyObject *self)
12217
39.2M
{
12218
39.2M
    return PyUnicode_GET_LENGTH(self);
12219
39.2M
}
12220
12221
/*[clinic input]
12222
str.ljust as unicode_ljust
12223
12224
    width: Py_ssize_t
12225
    fillchar: Py_UCS4 = ' '
12226
    /
12227
12228
Return a left-justified string of length width.
12229
12230
Padding is done using the specified fill character (default is a space).
12231
[clinic start generated code]*/
12232
12233
static PyObject *
12234
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12235
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12236
122
{
12237
122
    if (PyUnicode_GET_LENGTH(self) >= width)
12238
74
        return unicode_result_unchanged(self);
12239
12240
48
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12241
122
}
12242
12243
/*[clinic input]
12244
str.lower as unicode_lower
12245
12246
Return a copy of the string converted to lowercase.
12247
[clinic start generated code]*/
12248
12249
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* Implements str.lower(). */
    if (!PyUnicode_IS_ASCII(self)) {
        /* Non-ASCII text goes through the general case-mapping path. */
        return case_operation(self, do_lower);
    }

    /* ASCII-only strings take the dedicated helper; the flag value 1
       presumably selects lowercasing (helper is defined elsewhere). */
    return ascii_upper_or_lower(self, 1);
}
12257
12258
79.8M
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names used in error messages, indexed by the selectors above.
   The table is never written to, so the pointers are const as well as the
   characters they point at. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip selector to the name of the corresponding str method. */
#define STRIPNAME(i) (stripfuncnames[i])
12266
12267
/* externally visible for str.strip(unicode) */
12268
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from self every leading and/or trailing character that occurs
       in sepobj.  striptype is LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP.  The
       result is the substring self[left:right]. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);

    /* The bloom mask is a cheap pre-filter: a character that misses the
       mask is treated as not being in sepobj without searching for it. */
    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                               PyUnicode_DATA(sepobj),
                                               seplen);

    /* [left, right) is the half-open range of characters to keep. */
    Py_ssize_t left = 0;
    Py_ssize_t right = len;

    if (striptype != RIGHTSTRIP) {
        for (; left < len; left++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, left);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            /* The mask matched, so confirm with an exact search. */
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    if (striptype != LEFTSTRIP) {
        /* Never move below `left`: those characters are already gone. */
        for (; right > left; right--) {
            const Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
            if (!BLOOM(sepmask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
                break;
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
12314
12315
PyObject*
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
{
    /* Slice an exact str object given start/stop index objects.
       Returns NULL when the indices cannot be unpacked. */
    assert(PyUnicode_CheckExact(container));

    Py_ssize_t istart, istop;
    const Py_ssize_t len = PyUnicode_GET_LENGTH(container);

    if (_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
        return PyUnicode_Substring(container, istart, istop);
    }
    return NULL;
}
12326
12327
PyObject*
12328
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12329
278M
{
12330
278M
    const unsigned char *data;
12331
278M
    int kind;
12332
278M
    Py_ssize_t length;
12333
12334
278M
    length = PyUnicode_GET_LENGTH(self);
12335
278M
    end = Py_MIN(end, length);
12336
12337
278M
    if (start == 0 && end == length)
12338
76.7M
        return unicode_result_unchanged(self);
12339
12340
201M
    if (start < 0 || end < 0) {
12341
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12342
0
        return NULL;
12343
0
    }
12344
201M
    if (start >= length || end < start)
12345
5.00M
        _Py_RETURN_UNICODE_EMPTY();
12346
12347
196M
    length = end - start;
12348
196M
    if (PyUnicode_IS_ASCII(self)) {
12349
68.3M
        data = PyUnicode_1BYTE_DATA(self);
12350
68.3M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12351
68.3M
    }
12352
128M
    else {
12353
128M
        kind = PyUnicode_KIND(self);
12354
128M
        data = PyUnicode_1BYTE_DATA(self);
12355
128M
        return PyUnicode_FromKindAndData(kind,
12356
128M
                                         data + kind * start,
12357
128M
                                         length);
12358
128M
    }
12359
196M
}
12360
12361
static PyObject *
do_strip(PyObject *self, int striptype)
{
    /* Strip whitespace from the left, right or both ends of self,
       depending on striptype (LEFTSTRIP, RIGHTSTRIP, BOTHSTRIP).
       The result is the substring self[first:stop]. */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);

    /* [first, stop) is the half-open range of characters to keep. */
    Py_ssize_t first = 0;
    Py_ssize_t stop = len;

    if (PyUnicode_IS_ASCII(self)) {
        /* ASCII strings: classify bytes directly with the lookup table. */
        const Py_UCS1 *chars = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS1 ch = chars[first];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
        if (trim_right) {
            /* Never move below `first`. */
            for (; stop > first; stop--) {
                const Py_UCS1 ch = chars[stop - 1];
                if (!_Py_ascii_whitespace[ch]) {
                    break;
                }
            }
        }
    }
    else {
        /* General case: read characters at the string's own width. */
        const int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (trim_left) {
            for (; first < len; first++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, first);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; stop > first; stop--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, stop - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, first, stop);
}
12422
12423
12424
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    /* Shared entry point of str.strip/lstrip/rstrip: pick the whitespace
       or the explicit-character variant depending on `sep`. */

    /* No character set given: strip whitespace. */
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    /* Anything other than None must be a str. */
    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12440
12441
12442
/*[clinic input]
12443
@permit_long_summary
12444
str.strip as unicode_strip
12445
12446
    chars: object = None
12447
    /
12448
12449
Return a copy of the string with leading and trailing whitespace removed.
12450
12451
If chars is given and not None, remove characters in chars instead.
12452
[clinic start generated code]*/
12453
12454
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* Implements str.strip(): trim both ends. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12460
12461
12462
/*[clinic input]
12463
str.lstrip as unicode_lstrip
12464
12465
    chars: object = None
12466
    /
12467
12468
Return a copy of the string with leading whitespace removed.
12469
12470
If chars is given and not None, remove characters in chars instead.
12471
[clinic start generated code]*/
12472
12473
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* Implements str.lstrip(): trim the leading end only. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12479
12480
12481
/*[clinic input]
12482
str.rstrip as unicode_rstrip
12483
12484
    chars: object = None
12485
    /
12486
12487
Return a copy of the string with trailing whitespace removed.
12488
12489
If chars is given and not None, remove characters in chars instead.
12490
[clinic start generated code]*/
12491
12492
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* Implements str.rstrip(): trim the trailing end only. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12498
12499
12500
static PyObject*
12501
unicode_repeat(PyObject *str, Py_ssize_t len)
12502
646k
{
12503
646k
    PyObject *u;
12504
646k
    Py_ssize_t nchars, n;
12505
12506
646k
    if (len < 1)
12507
28.3k
        _Py_RETURN_UNICODE_EMPTY();
12508
12509
    /* no repeat, return original string */
12510
617k
    if (len == 1)
12511
107k
        return unicode_result_unchanged(str);
12512
12513
510k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12514
0
        PyErr_SetString(PyExc_OverflowError,
12515
0
                        "repeated string is too long");
12516
0
        return NULL;
12517
0
    }
12518
510k
    nchars = len * PyUnicode_GET_LENGTH(str);
12519
12520
510k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12521
510k
    if (!u)
12522
0
        return NULL;
12523
510k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12524
12525
510k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12526
508k
        int kind = PyUnicode_KIND(str);
12527
508k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12528
508k
        if (kind == PyUnicode_1BYTE_KIND) {
12529
508k
            void *to = PyUnicode_DATA(u);
12530
508k
            memset(to, (unsigned char)fill_char, len);
12531
508k
        }
12532
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12533
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12534
0
            for (n = 0; n < len; ++n)
12535
0
                ucs2[n] = fill_char;
12536
0
        } else {
12537
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12538
0
            assert(kind == PyUnicode_4BYTE_KIND);
12539
0
            for (n = 0; n < len; ++n)
12540
0
                ucs4[n] = fill_char;
12541
0
        }
12542
508k
    }
12543
2.65k
    else {
12544
2.65k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12545
2.65k
        char *to = (char *) PyUnicode_DATA(u);
12546
2.65k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12547
2.65k
            PyUnicode_GET_LENGTH(str) * char_size);
12548
2.65k
    }
12549
12550
510k
    assert(_PyUnicode_CheckConsistency(u, 1));
12551
510k
    return u;
12552
510k
}
12553
12554
PyObject *
12555
PyUnicode_Replace(PyObject *str,
12556
                  PyObject *substr,
12557
                  PyObject *replstr,
12558
                  Py_ssize_t maxcount)
12559
0
{
12560
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12561
0
            ensure_unicode(replstr) < 0)
12562
0
        return NULL;
12563
0
    return replace(str, substr, replstr, maxcount);
12564
0
}
12565
12566
/*[clinic input]
12567
@permit_long_docstring_body
12568
str.replace as unicode_replace
12569
12570
    old: unicode
12571
    new: unicode
12572
    /
12573
    count: Py_ssize_t = -1
12574
        Maximum number of occurrences to replace.
12575
        -1 (the default value) means replace all occurrences.
12576
12577
Return a copy with all occurrences of substring old replaced by new.
12578
12579
If the optional argument count is given, only the first count occurrences are
12580
replaced.
12581
[clinic start generated code]*/
12582
12583
static PyObject *
12584
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12585
                     Py_ssize_t count)
12586
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12587
75.6M
{
12588
75.6M
    return replace(self, old, new, count);
12589
75.6M
}
12590
12591
/*[clinic input]
12592
@permit_long_docstring_body
12593
str.removeprefix as unicode_removeprefix
12594
12595
    prefix: unicode
12596
    /
12597
12598
Return a str with the given prefix string removed if present.
12599
12600
If the string starts with the prefix string, return string[len(prefix):].
12601
Otherwise, return a copy of the original string.
12602
[clinic start generated code]*/
12603
12604
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* Implements str.removeprefix(). */

    /* tailmatch() yields -1 on error, otherwise a truth value; the
       direction argument -1 presumably selects matching at the start. */
    const int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
                               PyUnicode_GET_LENGTH(self));
}
12618
12619
/*[clinic input]
12620
str.removesuffix as unicode_removesuffix
12621
12622
    suffix: unicode
12623
    /
12624
12625
Return a str with the given suffix string removed if present.
12626
12627
If the string ends with the suffix string and that suffix is not empty,
12628
return string[:-len(suffix)]. Otherwise, return a copy of the original
12629
string.
12630
[clinic start generated code]*/
12631
12632
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* Implements str.removesuffix(). */

    /* tailmatch() yields -1 on error, otherwise a truth value; the
       direction argument +1 presumably selects matching at the end. */
    const int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (match == -1) {
        return NULL;
    }
    if (!match) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    const Py_ssize_t keep = PyUnicode_GET_LENGTH(self)
                            - PyUnicode_GET_LENGTH(suffix);
    return PyUnicode_Substring(self, 0, keep);
}
12646
12647
static PyObject *
unicode_repr(PyObject *unicode)
{
    /* Build the repr() of a str object.

       A first pass measures the output (escapes, quote characters, widest
       character that will be emitted verbatim); the result is then
       allocated once and filled in.  Returns a new reference, or NULL with
       OverflowError set if the output length would exceed PY_SSIZE_T_MAX
       (or with whatever error PyUnicode_New() sets if allocation fails). */
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                incr = 4; /* \xHH */
            else if (ch < 0x7f)
                ;
            else if (Py_UNICODE_ISPRINTABLE(ch))
                maxch = (ch > maxch) ? ch : maxch;
            else if (ch < 0x100)
                incr = 4; /* \xHH */
            else if (ch < 0x10000)
                incr = 6; /* \uHHHH */
            else
                incr = 10; /* \UHHHHHHHH */
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    /* Must be evaluated before the quote adjustments below: equal sizes
       mean that no character needed an escape sequence. */
    int changed = (osize != isize);
    if (squote) {
        changed = 1;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them */
            if (osize > PY_SSIZE_T_MAX - squote) {
                /* Guard the addition like the per-character one above;
                   signed overflow would be undefined behaviour. */
                PyErr_SetString(PyExc_OverflowError,
                                "string is too long to generate repr");
                return NULL;
            }
            osize += squote;
        }
        else {
            quote = '"';
        }
    }
    if (osize > PY_SSIZE_T_MAX - 2) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL)
        return NULL;
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing to escape: quote, verbatim copy, quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output width; the *_repr helpers are defined
           elsewhere and presumably write the quoted, escaped text into
           odata. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12737
12738
/*[clinic input]
12739
@permit_long_summary
12740
str.rfind as unicode_rfind = str.count
12741
12742
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12743
12744
Optional arguments start and end are interpreted as in slice notation.
12745
Return -1 on failure.
12746
[clinic start generated code]*/
12747
12748
static Py_ssize_t
12749
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12750
                   Py_ssize_t end)
12751
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12752
273k
{
12753
273k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12754
273k
    if (result < 0) {
12755
8.57k
        return -1;
12756
8.57k
    }
12757
265k
    return result;
12758
273k
}
12759
12760
/*[clinic input]
12761
@permit_long_summary
12762
str.rindex as unicode_rindex = str.count
12763
12764
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12765
12766
Optional arguments start and end are interpreted as in slice notation.
12767
Raises ValueError when the substring is not found.
12768
[clinic start generated code]*/
12769
12770
static Py_ssize_t
12771
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12772
                    Py_ssize_t end)
12773
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12774
147k
{
12775
147k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12776
147k
    if (result == -1) {
12777
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12778
0
    }
12779
147k
    else if (result < 0) {
12780
0
        return -1;
12781
0
    }
12782
147k
    return result;
12783
147k
}
12784
12785
/*[clinic input]
12786
str.rjust as unicode_rjust
12787
12788
    width: Py_ssize_t
12789
    fillchar: Py_UCS4 = ' '
12790
    /
12791
12792
Return a right-justified string of length width.
12793
12794
Padding is done using the specified fill character (default is a space).
12795
[clinic start generated code]*/
12796
12797
static PyObject *
12798
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12799
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12800
0
{
12801
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12802
0
        return unicode_result_unchanged(self);
12803
12804
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12805
0
}
12806
12807
PyObject *
12808
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12809
0
{
12810
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12811
0
        return NULL;
12812
12813
0
    return split(s, sep, maxsplit);
12814
0
}
12815
12816
/*[clinic input]
12817
@permit_long_summary
12818
str.split as unicode_split
12819
12820
    sep: object = None
12821
        The separator used to split the string.
12822
12823
        When set to None (the default value), will split on any whitespace
12824
        character (including \n \r \t \f and spaces) and will discard
12825
        empty strings from the result.
12826
    maxsplit: Py_ssize_t = -1
12827
        Maximum number of splits.
12828
        -1 (the default value) means no limit.
12829
12830
Return a list of the substrings in the string, using sep as the separator string.
12831
12832
Splitting starts at the front of the string and works to the end.
12833
12834
Note, str.split() is mainly useful for data that has been intentionally
12835
delimited.  With natural text that includes punctuation, consider using
12836
the regular expression module.
12837
12838
[clinic start generated code]*/
12839
12840
static PyObject *
12841
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12842
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12843
27.0M
{
12844
27.0M
    if (sep == Py_None)
12845
168k
        return split(self, NULL, maxsplit);
12846
26.8M
    if (PyUnicode_Check(sep))
12847
26.8M
        return split(self, sep, maxsplit);
12848
12849
0
    PyErr_Format(PyExc_TypeError,
12850
0
                 "must be str or None, not %.100s",
12851
0
                 Py_TYPE(sep)->tp_name);
12852
0
    return NULL;
12853
26.8M
}
12854
12855
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    /* Partition str_obj around sep_obj.  Returns the object produced by
       the kind-specific *_partition helper, a packed (str_obj, "", "")
       tuple when the separator cannot possibly occur, or NULL on error. */
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    /* A separator that is wider or longer than the string is treated as
       not found without searching. */
    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);

    /* The helpers are handed both buffers at the string's kind, so a
       separator of a different kind is converted into a temporary buffer
       that is released with PyMem_Free() below. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12905
12906
12907
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    /* Partition str_obj around sep_obj using the *_rpartition helpers.
       Returns the object produced by the kind-specific helper, a packed
       ("", "", str_obj) tuple when the separator cannot possibly occur,
       or NULL on error. */
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int kind1 = PyUnicode_KIND(str_obj);
    const int kind2 = PyUnicode_KIND(sep_obj);
    const Py_ssize_t len1 = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t len2 = PyUnicode_GET_LENGTH(sep_obj);

    /* A separator that is wider or longer than the string is treated as
       not found without searching. */
    if (kind1 < kind2 || len1 < len2) {
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *buf1 = PyUnicode_DATA(str_obj);
    const void *buf2 = PyUnicode_DATA(sep_obj);

    /* The helpers are handed both buffers at the string's kind, so a
       separator of a different kind is converted into a temporary buffer
       that is released with PyMem_Free() below. */
    const int converted = (kind2 != kind1);
    if (converted) {
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
        if (buf2 == NULL) {
            return NULL;
        }
    }

    PyObject *out;
    switch (kind1) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        else {
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    case PyUnicode_4BYTE_KIND:
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)buf2);
    }

    return out;
}
12957
12958
/*[clinic input]
12959
@permit_long_docstring_body
12960
str.partition as unicode_partition
12961
12962
    sep: object
12963
    /
12964
12965
Partition the string into three parts using the given separator.
12966
12967
This will search for the separator in the string.  If the separator is found,
12968
returns a 3-tuple containing the part before the separator, the separator
12969
itself, and the part after it.
12970
12971
If the separator is not found, returns a 3-tuple containing the original string
12972
and two empty strings.
12973
[clinic start generated code]*/
12974
12975
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* Method entry point for str.partition: forwards both arguments to
       PyUnicode_Partition() and returns whatever it produced. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12981
12982
/*[clinic input]
12983
@permit_long_docstring_body
12984
str.rpartition as unicode_rpartition = str.partition
12985
12986
Partition the string into three parts using the given separator.
12987
12988
This will search for the separator in the string, starting at the end. If
12989
the separator is found, returns a 3-tuple containing the part before the
12990
separator, the separator itself, and the part after it.
12991
12992
If the separator is not found, returns a 3-tuple containing two empty strings
12993
and the original string.
12994
[clinic start generated code]*/
12995
12996
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* Method entry point for str.rpartition: forwards both arguments to
       PyUnicode_RPartition() and returns whatever it produced. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13002
13003
PyObject *
13004
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13005
0
{
13006
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13007
0
        return NULL;
13008
13009
0
    return rsplit(s, sep, maxsplit);
13010
0
}
13011
13012
/*[clinic input]
13013
@permit_long_summary
13014
str.rsplit as unicode_rsplit = str.split
13015
13016
Return a list of the substrings in the string, using sep as the separator string.
13017
13018
Splitting starts at the end of the string and works to the front.
13019
[clinic start generated code]*/
13020
13021
static PyObject *
13022
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13023
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13024
78
{
13025
78
    if (sep == Py_None)
13026
0
        return rsplit(self, NULL, maxsplit);
13027
78
    if (PyUnicode_Check(sep))
13028
78
        return rsplit(self, sep, maxsplit);
13029
13030
0
    PyErr_Format(PyExc_TypeError,
13031
0
                 "must be str or None, not %.100s",
13032
0
                 Py_TYPE(sep)->tp_name);
13033
0
    return NULL;
13034
78
}
13035
13036
/*[clinic input]
13037
@permit_long_docstring_body
13038
str.splitlines as unicode_splitlines
13039
13040
    keepends: bool = False
13041
13042
Return a list of the lines in the string, breaking at line boundaries.
13043
13044
Line breaks are not included in the resulting list unless keepends is given and
13045
true.
13046
[clinic start generated code]*/
13047
13048
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* Method entry point for str.splitlines: hands the string and the
       keepends flag to PyUnicode_Splitlines() and returns its result. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13054
13055
static
13056
PyObject *unicode_str(PyObject *self)
13057
2.84M
{
13058
2.84M
    return unicode_result_unchanged(self);
13059
2.84M
}
13060
13061
/*[clinic input]
13062
@permit_long_summary
13063
str.swapcase as unicode_swapcase
13064
13065
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13066
[clinic start generated code]*/
13067
13068
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* Method entry point for str.swapcase: runs case_operation() with the
       do_swapcase callback and returns its result. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13074
13075
static int
13076
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13077
0
{
13078
0
    PyObject *key, *value;
13079
0
    Py_ssize_t i = 0;
13080
0
    int res;
13081
0
    while (PyDict_Next(x, &i, &key, &value)) {
13082
0
        if (PyUnicode_Check(key)) {
13083
0
            PyObject *newkey;
13084
0
            int kind;
13085
0
            const void *data;
13086
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13087
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13088
0
                                "table must be of length 1");
13089
0
                return -1;
13090
0
            }
13091
0
            kind = PyUnicode_KIND(key);
13092
0
            data = PyUnicode_DATA(key);
13093
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13094
0
            if (!newkey)
13095
0
                return -1;
13096
0
            res = PyDict_SetItem(newdict, newkey, value);
13097
0
            Py_DECREF(newkey);
13098
0
            if (res < 0)
13099
0
                return -1;
13100
0
        }
13101
0
        else if (PyLong_Check(key)) {
13102
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13103
0
                return -1;
13104
0
        }
13105
0
        else {
13106
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13107
0
                            "be strings or integers");
13108
0
            return -1;
13109
0
        }
13110
0
    }
13111
0
    return 0;
13112
0
}
13113
13114
/*[clinic input]
13115
13116
@staticmethod
13117
str.maketrans as unicode_maketrans
13118
13119
  x: object
13120
13121
  y: unicode=NULL
13122
13123
  z: unicode=NULL
13124
13125
  /
13126
13127
Return a translation table usable for str.translate().
13128
13129
If there is only one argument, it must be a dictionary mapping Unicode
13130
ordinals (integers) or characters to Unicode ordinals, strings or None.
13131
Character keys will be then converted to ordinals.
13132
If there are two arguments, they must be strings of equal length, and
13133
in the resulting dictionary, each character in x will be mapped to the
13134
character at the same position in y. If there is a third argument, it
13135
must be a string, whose characters will be mapped to None in the result.
13136
[clinic start generated code]*/
13137
13138
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* The translation table being built; released on every error path. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* Single-argument form: x has to be a dict. */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }
        /* Copy the entries into the new dict (string keys become int keys)
           while inside the critical section on x. */
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto err;
        }
    }
    else {
        /* Two/three-argument form: x must be a str with the same length
           as y. */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* Map the code point at each position of x to the code point at the
           same position of y. */
        int from_kind = PyUnicode_KIND(x);
        int to_kind = PyUnicode_KIND(y);
        const void *from_data = PyUnicode_DATA(x);
        const void *to_data = PyUnicode_DATA(y);

        for (Py_ssize_t pos = 0; pos < PyUnicode_GET_LENGTH(x); pos++) {
            PyObject *ordinal = PyLong_FromLong(
                PyUnicode_READ(from_kind, from_data, pos));
            if (ordinal == NULL) {
                goto err;
            }
            PyObject *target = PyLong_FromLong(
                PyUnicode_READ(to_kind, to_data, pos));
            if (target == NULL) {
                Py_DECREF(ordinal);
                goto err;
            }
            int status = PyDict_SetItem(table, ordinal, target);
            Py_DECREF(ordinal);
            Py_DECREF(target);
            if (status < 0) {
                goto err;
            }
        }

        /* Every code point of z (when given) is mapped to None. */
        if (z != NULL) {
            int del_kind = PyUnicode_KIND(z);
            const void *del_data = PyUnicode_DATA(z);

            for (Py_ssize_t pos = 0; pos < PyUnicode_GET_LENGTH(z); pos++) {
                PyObject *ordinal = PyLong_FromLong(
                    PyUnicode_READ(del_kind, del_data, pos));
                if (ordinal == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, ordinal, Py_None);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto err;
                }
            }
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13218
13219
/*[clinic input]
13220
@permit_long_docstring_body
13221
str.translate as unicode_translate
13222
13223
    table: object
13224
        Translation table, which must be a mapping of Unicode ordinals to
13225
        Unicode ordinals, strings, or None.
13226
    /
13227
13228
Replace each character in the string using the given translation table.
13229
13230
The table must implement lookup/indexing via __getitem__, for instance a
13231
dictionary or list.  If this operation raises LookupError, the character is
13232
left untouched.  Characters mapped to None are deleted.
13233
[clinic start generated code]*/
13234
13235
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* Method entry point for str.translate: delegates to
       _PyUnicode_TranslateCharmap() with the "ignore" error handler. */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13241
13242
/*[clinic input]
13243
str.upper as unicode_upper
13244
13245
Return a copy of the string converted to uppercase.
13246
[clinic start generated code]*/
13247
13248
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* Non-ASCII strings go through case_operation() with do_upper; ASCII
       strings use ascii_upper_or_lower() with 0 as its second argument. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13256
13257
/*[clinic input]
13258
@permit_long_summary
13259
str.zfill as unicode_zfill
13260
13261
    width: Py_ssize_t
13262
    /
13263
13264
Pad a numeric string with zeros on the left, to fill a field of the given width.
13265
13266
The string is never truncated.
13267
[clinic start generated code]*/
13268
13269
static PyObject *
13270
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13271
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13272
0
{
13273
0
    Py_ssize_t fill;
13274
0
    PyObject *u;
13275
0
    int kind;
13276
0
    const void *data;
13277
0
    Py_UCS4 chr;
13278
13279
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13280
0
        return unicode_result_unchanged(self);
13281
13282
0
    fill = width - PyUnicode_GET_LENGTH(self);
13283
13284
0
    u = pad(self, fill, 0, '0');
13285
13286
0
    if (u == NULL)
13287
0
        return NULL;
13288
13289
0
    kind = PyUnicode_KIND(u);
13290
0
    data = PyUnicode_DATA(u);
13291
0
    chr = PyUnicode_READ(kind, data, fill);
13292
13293
0
    if (chr == '+' || chr == '-') {
13294
        /* move sign to beginning of string */
13295
0
        PyUnicode_WRITE(kind, data, 0, chr);
13296
0
        PyUnicode_WRITE(kind, data, fill, '0');
13297
0
    }
13298
13299
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13300
0
    return u;
13301
0
}
13302
13303
/*[clinic input]
13304
@permit_long_summary
13305
@text_signature "($self, prefix[, start[, end]], /)"
13306
str.startswith as unicode_startswith
13307
13308
    prefix as subobj: object
13309
        A string or a tuple of strings to try.
13310
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13311
        Optional start position. Default: start of the string.
13312
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13313
        Optional stop position. Default: end of the string.
13314
    /
13315
13316
Return True if the string starts with the specified prefix, False otherwise.
13317
[clinic start generated code]*/
13318
13319
static PyObject *
13320
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13321
                        Py_ssize_t end)
13322
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13323
78.3M
{
13324
78.3M
    if (PyTuple_Check(subobj)) {
13325
9.33M
        Py_ssize_t i;
13326
33.8M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13327
24.5M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13328
24.5M
            if (!PyUnicode_Check(substring)) {
13329
0
                PyErr_Format(PyExc_TypeError,
13330
0
                             "tuple for startswith must only contain str, "
13331
0
                             "not %.100s",
13332
0
                             Py_TYPE(substring)->tp_name);
13333
0
                return NULL;
13334
0
            }
13335
24.5M
            int result = tailmatch(self, substring, start, end, -1);
13336
24.5M
            if (result < 0) {
13337
0
                return NULL;
13338
0
            }
13339
24.5M
            if (result) {
13340
66.1k
                Py_RETURN_TRUE;
13341
66.1k
            }
13342
24.5M
        }
13343
        /* nothing matched */
13344
9.33M
        Py_RETURN_FALSE;
13345
9.33M
    }
13346
68.9M
    if (!PyUnicode_Check(subobj)) {
13347
0
        PyErr_Format(PyExc_TypeError,
13348
0
                     "startswith first arg must be str or "
13349
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13350
0
        return NULL;
13351
0
    }
13352
68.9M
    int result = tailmatch(self, subobj, start, end, -1);
13353
68.9M
    if (result < 0) {
13354
0
        return NULL;
13355
0
    }
13356
68.9M
    return PyBool_FromLong(result);
13357
68.9M
}
13358
13359
13360
/*[clinic input]
13361
@permit_long_summary
13362
@text_signature "($self, suffix[, start[, end]], /)"
13363
str.endswith as unicode_endswith
13364
13365
    suffix as subobj: object
13366
        A string or a tuple of strings to try.
13367
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13368
        Optional start position. Default: start of the string.
13369
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13370
        Optional stop position. Default: end of the string.
13371
    /
13372
13373
Return True if the string ends with the specified suffix, False otherwise.
13374
[clinic start generated code]*/
13375
13376
static PyObject *
13377
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13378
                      Py_ssize_t end)
13379
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13380
10.5M
{
13381
10.5M
    if (PyTuple_Check(subobj)) {
13382
197k
        Py_ssize_t i;
13383
381k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13384
355k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13385
355k
            if (!PyUnicode_Check(substring)) {
13386
0
                PyErr_Format(PyExc_TypeError,
13387
0
                             "tuple for endswith must only contain str, "
13388
0
                             "not %.100s",
13389
0
                             Py_TYPE(substring)->tp_name);
13390
0
                return NULL;
13391
0
            }
13392
355k
            int result = tailmatch(self, substring, start, end, +1);
13393
355k
            if (result < 0) {
13394
0
                return NULL;
13395
0
            }
13396
355k
            if (result) {
13397
170k
                Py_RETURN_TRUE;
13398
170k
            }
13399
355k
        }
13400
197k
        Py_RETURN_FALSE;
13401
197k
    }
13402
10.3M
    if (!PyUnicode_Check(subobj)) {
13403
0
        PyErr_Format(PyExc_TypeError,
13404
0
                     "endswith first arg must be str or "
13405
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13406
0
        return NULL;
13407
0
    }
13408
10.3M
    int result = tailmatch(self, subobj, start, end, +1);
13409
10.3M
    if (result < 0) {
13410
0
        return NULL;
13411
0
    }
13412
10.3M
    return PyBool_FromLong(result);
13413
10.3M
}
13414
13415
13416
#include "stringlib/unicode_format.h"
13417
13418
PyDoc_STRVAR(format__doc__,
13419
             "format($self, /, *args, **kwargs)\n\
13420
--\n\
13421
\n\
13422
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13423
The substitutions are identified by braces ('{' and '}').");
13424
13425
PyDoc_STRVAR(format_map__doc__,
13426
             "format_map($self, mapping, /)\n\
13427
--\n\
13428
\n\
13429
Return a formatted version of the string, using substitutions from mapping.\n\
13430
The substitutions are identified by braces ('{' and '}').");
13431
13432
/*[clinic input]
13433
str.__format__ as unicode___format__
13434
13435
    format_spec: unicode
13436
    /
13437
13438
Return a formatted version of the string as described by format_spec.
13439
[clinic start generated code]*/
13440
13441
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    _PyUnicodeWriter writer;
    _PyUnicodeWriter_Init(&writer);

    /* Format `self` using the whole of format_spec (offsets 0..length). */
    Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    int status = _PyUnicode_FormatAdvancedWriter(&writer,
                                                 self, format_spec, 0,
                                                 spec_end);
    if (status != -1) {
        return _PyUnicodeWriter_Finish(&writer);
    }

    /* Formatting failed: release the writer before reporting the error. */
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
13458
13459
/*[clinic input]
13460
str.__sizeof__ as unicode_sizeof
13461
13462
Return the size of the string in memory, in bytes.
13463
[clinic start generated code]*/
13464
13465
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    Py_ssize_t total;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header plus one byte per character and one more
           byte. */
        total = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Other compact objects: header plus (length + 1) units of
           PyUnicode_KIND() bytes each. */
        total = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base structure, plus the character block
           when one is present. */
        total = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            total += (PyUnicode_GET_LENGTH(self) + 1) *
                PyUnicode_KIND(self);
        }
    }

    /* Add the UTF-8 buffer when _PyUnicode_HAS_UTF8_MEMORY() reports one. */
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        total += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(total);
}
13493
13494
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    /* Build a 1-tuple around a copy of the string made by
       _PyUnicode_Copy(); the copy is passed to Py_BuildValue() through the
       "N" format unit. */
    PyObject *duplicate = _PyUnicode_Copy(v);
    if (duplicate == NULL) {
        return NULL;
    }
    return Py_BuildValue("(N)", duplicate);
}
13502
13503
/*
13504
This function searches for the longest common leading whitespace
13505
of all lines in the [src, end).
13506
It returns the length of the common leading whitespace and sets `output` to
13507
point to the beginning of the common leading whitespace if length > 0.
13508
*/
13509
static Py_ssize_t
13510
search_longest_common_leading_whitespace(
13511
    const char *const src,
13512
    const char *const end,
13513
    const char **output)
13514
0
{
13515
    // [_start, _start + _len)
13516
    // describes the current longest common leading whitespace
13517
0
    const char *_start = NULL;
13518
0
    Py_ssize_t _len = 0;
13519
13520
0
    for (const char *iter = src; iter < end; ++iter) {
13521
0
        const char *line_start = iter;
13522
0
        const char *leading_whitespace_end = NULL;
13523
13524
        // scan the whole line
13525
0
        while (iter < end && *iter != '\n') {
13526
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13527
                /* `iter` points to the first non-whitespace character
13528
                   in this line */
13529
0
                if (iter == line_start) {
13530
                    // some line has no indent, fast exit!
13531
0
                    return 0;
13532
0
                }
13533
0
                leading_whitespace_end = iter;
13534
0
            }
13535
0
            ++iter;
13536
0
        }
13537
13538
        // if this line has all white space, skip it
13539
0
        if (!leading_whitespace_end) {
13540
0
            continue;
13541
0
        }
13542
13543
0
        if (!_start) {
13544
            // update the first leading whitespace
13545
0
            _start = line_start;
13546
0
            _len = leading_whitespace_end - line_start;
13547
0
            assert(_len > 0);
13548
0
        }
13549
0
        else {
13550
            /* We then compare with the current longest leading whitespace.
13551
13552
               [line_start, leading_whitespace_end) is the leading
13553
               whitespace of this line,
13554
13555
               [_start, _start + _len) is the leading whitespace of the
13556
               current longest leading whitespace. */
13557
0
            Py_ssize_t new_len = 0;
13558
0
            const char *_iter = _start, *line_iter = line_start;
13559
13560
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13561
0
                   && *_iter == *line_iter)
13562
0
            {
13563
0
                ++_iter;
13564
0
                ++line_iter;
13565
0
                ++new_len;
13566
0
            }
13567
13568
0
            _len = new_len;
13569
0
            if (_len == 0) {
13570
                // No common things now, fast exit!
13571
0
                return 0;
13572
0
            }
13573
0
        }
13574
0
    }
13575
13576
0
    assert(_len >= 0);
13577
0
    if (_len > 0) {
13578
0
        *output = _start;
13579
0
    }
13580
0
    return _len;
13581
0
}
13582
13583
/* Dedent a string.
13584
   Behaviour is expected to be an exact match of `textwrap.dedent`.
13585
   Return a new reference on success, NULL with exception set on error.
13586
   */
13587
PyObject *
13588
_PyUnicode_Dedent(PyObject *unicode)
13589
0
{
13590
0
    Py_ssize_t src_len = 0;
13591
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13592
0
    if (!src) {
13593
0
        return NULL;
13594
0
    }
13595
0
    assert(src_len >= 0);
13596
0
    if (src_len == 0) {
13597
0
        return Py_NewRef(unicode);
13598
0
    }
13599
13600
0
    const char *const end = src + src_len;
13601
13602
    // [whitespace_start, whitespace_start + whitespace_len)
13603
    // describes the current longest common leading whitespace
13604
0
    const char *whitespace_start = NULL;
13605
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13606
0
        src, end, &whitespace_start);
13607
13608
0
    if (whitespace_len == 0) {
13609
0
        return Py_NewRef(unicode);
13610
0
    }
13611
13612
    // now we should trigger a dedent
13613
0
    char *dest = PyMem_Malloc(src_len);
13614
0
    if (!dest) {
13615
0
        PyErr_NoMemory();
13616
0
        return NULL;
13617
0
    }
13618
0
    char *dest_iter = dest;
13619
13620
0
    for (const char *iter = src; iter < end; ++iter) {
13621
0
        const char *line_start = iter;
13622
0
        bool in_leading_space = true;
13623
13624
        // iterate over a line to find the end of a line
13625
0
        while (iter < end && *iter != '\n') {
13626
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13627
0
                in_leading_space = false;
13628
0
            }
13629
0
            ++iter;
13630
0
        }
13631
13632
        // invariant: *iter == '\n' or iter == end
13633
0
        bool append_newline = iter < end;
13634
13635
        // if this line has all white space, write '\n' and continue
13636
0
        if (in_leading_space && append_newline) {
13637
0
            *dest_iter++ = '\n';
13638
0
            continue;
13639
0
        }
13640
13641
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13642
            conditionally append '\n' */
13643
13644
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13645
0
        assert(new_line_len >= 0);
13646
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13647
13648
0
        dest_iter += new_line_len;
13649
13650
0
        if (append_newline) {
13651
0
            *dest_iter++ = '\n';
13652
0
        }
13653
0
    }
13654
13655
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13656
0
    PyMem_Free(dest);
13657
0
    return res;
13658
0
}
13659
13660
static PyMethodDef unicode_methods[] = {
13661
    UNICODE_ENCODE_METHODDEF
13662
    UNICODE_REPLACE_METHODDEF
13663
    UNICODE_SPLIT_METHODDEF
13664
    UNICODE_RSPLIT_METHODDEF
13665
    UNICODE_JOIN_METHODDEF
13666
    UNICODE_CAPITALIZE_METHODDEF
13667
    UNICODE_CASEFOLD_METHODDEF
13668
    UNICODE_TITLE_METHODDEF
13669
    UNICODE_CENTER_METHODDEF
13670
    UNICODE_COUNT_METHODDEF
13671
    UNICODE_EXPANDTABS_METHODDEF
13672
    UNICODE_FIND_METHODDEF
13673
    UNICODE_PARTITION_METHODDEF
13674
    UNICODE_INDEX_METHODDEF
13675
    UNICODE_LJUST_METHODDEF
13676
    UNICODE_LOWER_METHODDEF
13677
    UNICODE_LSTRIP_METHODDEF
13678
    UNICODE_RFIND_METHODDEF
13679
    UNICODE_RINDEX_METHODDEF
13680
    UNICODE_RJUST_METHODDEF
13681
    UNICODE_RSTRIP_METHODDEF
13682
    UNICODE_RPARTITION_METHODDEF
13683
    UNICODE_SPLITLINES_METHODDEF
13684
    UNICODE_STRIP_METHODDEF
13685
    UNICODE_SWAPCASE_METHODDEF
13686
    UNICODE_TRANSLATE_METHODDEF
13687
    UNICODE_UPPER_METHODDEF
13688
    UNICODE_STARTSWITH_METHODDEF
13689
    UNICODE_ENDSWITH_METHODDEF
13690
    UNICODE_REMOVEPREFIX_METHODDEF
13691
    UNICODE_REMOVESUFFIX_METHODDEF
13692
    UNICODE_ISASCII_METHODDEF
13693
    UNICODE_ISLOWER_METHODDEF
13694
    UNICODE_ISUPPER_METHODDEF
13695
    UNICODE_ISTITLE_METHODDEF
13696
    UNICODE_ISSPACE_METHODDEF
13697
    UNICODE_ISDECIMAL_METHODDEF
13698
    UNICODE_ISDIGIT_METHODDEF
13699
    UNICODE_ISNUMERIC_METHODDEF
13700
    UNICODE_ISALPHA_METHODDEF
13701
    UNICODE_ISALNUM_METHODDEF
13702
    UNICODE_ISIDENTIFIER_METHODDEF
13703
    UNICODE_ISPRINTABLE_METHODDEF
13704
    UNICODE_ZFILL_METHODDEF
13705
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13706
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13707
    UNICODE___FORMAT___METHODDEF
13708
    UNICODE_MAKETRANS_METHODDEF
13709
    UNICODE_SIZEOF_METHODDEF
13710
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13711
    {NULL, NULL}
13712
};
13713
13714
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    /* nb_remainder slot: a str left operand is formatted with
       PyUnicode_Format(); any other left operand yields NotImplemented. */
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13721
13722
static PyNumberMethods unicode_as_number = {
13723
    0,              /*nb_add*/
13724
    0,              /*nb_subtract*/
13725
    0,              /*nb_multiply*/
13726
    unicode_mod,            /*nb_remainder*/
13727
};
13728
13729
static PySequenceMethods unicode_as_sequence = {
13730
    unicode_length,     /* sq_length */
13731
    PyUnicode_Concat,   /* sq_concat */
13732
    unicode_repeat,     /* sq_repeat */
13733
    unicode_getitem,    /* sq_item */
13734
    0,                  /* sq_slice */
13735
    0,                  /* sq_ass_item */
13736
    0,                  /* sq_ass_slice */
13737
    PyUnicode_Contains, /* sq_contains */
13738
};
13739
13740
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    /* --- integer-like index ------------------------------------------- */
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        /* negative indices count from the end of the string */
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    /* --- anything that is not a slice is rejected ---------------------- */
    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    /* --- slice ---------------------------------------------------------- */
    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    Py_ssize_t slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                                   &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        /* contiguous slice: either the whole string or a plain substring */
        if (start == 0 && slicelength == PyUnicode_GET_LENGTH(self)) {
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self,
                                   start, start + slicelength);
    }

    /* General case: step != 1, characters are picked one at a time. */
    int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);
    Py_UCS4 max_char;

    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        /* Find the largest selected character; the scan stops as soon as
           it reaches kind_maxchar_limit(src_kind). */
        Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        size_t scan_pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; scan_pos += step, n++) {
            Py_UCS4 seen = PyUnicode_READ(src_kind, src_data, scan_pos);
            if (seen > max_char) {
                max_char = seen;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Copy the selected characters into consecutive positions. */
    size_t copy_pos = start;
    for (Py_ssize_t n = 0; n < slicelength; copy_pos += step, n++) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, copy_pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13809
13810
static PyMappingMethods unicode_as_mapping = {
13811
    unicode_length,     /* mp_length */
13812
    unicode_subscript,  /* mp_subscript */
13813
    0,                  /* mp_ass_subscript */
13814
};
13815
13816
13817
static PyObject *
13818
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13819
13820
/*[clinic input]
13821
@classmethod
13822
str.__new__ as unicode_new
13823
13824
    object as x: object = NULL
13825
    encoding: str = NULL
13826
    errors: str = NULL
13827
13828
[clinic start generated code]*/
13829
13830
static PyObject *
13831
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13832
                 const char *errors)
13833
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13834
16.0M
{
13835
16.0M
    PyObject *unicode;
13836
16.0M
    if (x == NULL) {
13837
0
        unicode = _PyUnicode_GetEmpty();
13838
0
    }
13839
16.0M
    else if (encoding == NULL && errors == NULL) {
13840
16.0M
        unicode = PyObject_Str(x);
13841
16.0M
    }
13842
0
    else {
13843
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13844
0
    }
13845
13846
16.0M
    if (unicode != NULL && type != &PyUnicode_Type) {
13847
16.0M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13848
16.0M
    }
13849
16.0M
    return unicode;
13850
16.0M
}
13851
13852
static const char *
13853
arg_as_utf8(PyObject *obj, const char *name)
13854
698k
{
13855
698k
    if (!PyUnicode_Check(obj)) {
13856
0
        PyErr_Format(PyExc_TypeError,
13857
0
                     "str() argument '%s' must be str, not %T",
13858
0
                     name, obj);
13859
0
        return NULL;
13860
0
    }
13861
698k
    return _PyUnicode_AsUTF8NoNUL(obj);
13862
698k
}
13863
13864
static PyObject *
13865
unicode_vectorcall(PyObject *type, PyObject *const *args,
13866
                   size_t nargsf, PyObject *kwnames)
13867
435k
{
13868
435k
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
13869
13870
435k
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
13871
435k
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
13872
        // Fallback to unicode_new()
13873
0
        PyObject *tuple = PyTuple_FromArray(args, nargs);
13874
0
        if (tuple == NULL) {
13875
0
            return NULL;
13876
0
        }
13877
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
13878
0
        if (dict == NULL) {
13879
0
            Py_DECREF(tuple);
13880
0
            return NULL;
13881
0
        }
13882
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
13883
0
        Py_DECREF(tuple);
13884
0
        Py_DECREF(dict);
13885
0
        return ret;
13886
0
    }
13887
435k
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
13888
0
        return NULL;
13889
0
    }
13890
435k
    if (nargs == 0) {
13891
3.87k
        return _PyUnicode_GetEmpty();
13892
3.87k
    }
13893
431k
    PyObject *object = args[0];
13894
431k
    if (nargs == 1) {
13895
1.23k
        return PyObject_Str(object);
13896
1.23k
    }
13897
430k
    const char *encoding = arg_as_utf8(args[1], "encoding");
13898
430k
    if (encoding == NULL) {
13899
0
        return NULL;
13900
0
    }
13901
430k
    const char *errors = NULL;
13902
430k
    if (nargs == 3) {
13903
267k
        errors = arg_as_utf8(args[2], "errors");
13904
267k
        if (errors == NULL) {
13905
0
            return NULL;
13906
0
        }
13907
267k
    }
13908
430k
    return PyUnicode_FromEncodedObject(object, encoding, errors);
13909
430k
}
13910
13911
static PyObject *
13912
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13913
16.0M
{
13914
16.0M
    PyObject *self;
13915
16.0M
    Py_ssize_t length, char_size;
13916
16.0M
    int share_utf8;
13917
16.0M
    int kind;
13918
16.0M
    void *data;
13919
13920
16.0M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13921
16.0M
    assert(_PyUnicode_CHECK(unicode));
13922
13923
16.0M
    self = type->tp_alloc(type, 0);
13924
16.0M
    if (self == NULL) {
13925
0
        return NULL;
13926
0
    }
13927
16.0M
    kind = PyUnicode_KIND(unicode);
13928
16.0M
    length = PyUnicode_GET_LENGTH(unicode);
13929
13930
16.0M
    _PyUnicode_LENGTH(self) = length;
13931
#ifdef Py_DEBUG
13932
    _PyUnicode_HASH(self) = -1;
13933
#else
13934
16.0M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13935
16.0M
#endif
13936
16.0M
    _PyUnicode_STATE(self).interned = 0;
13937
16.0M
    _PyUnicode_STATE(self).kind = kind;
13938
16.0M
    _PyUnicode_STATE(self).compact = 0;
13939
16.0M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13940
16.0M
    _PyUnicode_STATE(self).statically_allocated = 0;
13941
16.0M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13942
16.0M
    PyUnicode_SET_UTF8(self, NULL);
13943
16.0M
    _PyUnicode_DATA_ANY(self) = NULL;
13944
13945
16.0M
    share_utf8 = 0;
13946
16.0M
    if (kind == PyUnicode_1BYTE_KIND) {
13947
13.5M
        char_size = 1;
13948
13.5M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13949
13.4M
            share_utf8 = 1;
13950
13.5M
    }
13951
2.51M
    else if (kind == PyUnicode_2BYTE_KIND) {
13952
2.46M
        char_size = 2;
13953
2.46M
    }
13954
56.4k
    else {
13955
56.4k
        assert(kind == PyUnicode_4BYTE_KIND);
13956
56.4k
        char_size = 4;
13957
56.4k
    }
13958
13959
    /* Ensure we won't overflow the length. */
13960
16.0M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13961
0
        PyErr_NoMemory();
13962
0
        goto onError;
13963
0
    }
13964
16.0M
    data = PyMem_Malloc((length + 1) * char_size);
13965
16.0M
    if (data == NULL) {
13966
0
        PyErr_NoMemory();
13967
0
        goto onError;
13968
0
    }
13969
13970
16.0M
    _PyUnicode_DATA_ANY(self) = data;
13971
16.0M
    if (share_utf8) {
13972
13.4M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13973
13.4M
        PyUnicode_SET_UTF8(self, data);
13974
13.4M
    }
13975
13976
16.0M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13977
16.0M
    assert(_PyUnicode_CheckConsistency(self, 1));
13978
#ifdef Py_DEBUG
13979
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13980
#endif
13981
16.0M
    return self;
13982
13983
0
onError:
13984
0
    Py_DECREF(self);
13985
0
    return NULL;
13986
16.0M
}
13987
13988
/* Deallocate `op` by forwarding to unicode_dealloc().
   The assertion documents the precondition that `op` is an exact str
   (PyUnicode_CheckExact), not an instance of a subclass. */
void
13989
_PyUnicode_ExactDealloc(PyObject *op)
13990
106M
{
13991
106M
    assert(PyUnicode_CheckExact(op));
13992
106M
    unicode_dealloc(op);
13993
106M
}
13994
13995
PyDoc_STRVAR(unicode_doc,
13996
"str(object='') -> str\n\
13997
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
13998
\n\
13999
Create a new string object from the given object. If encoding or\n\
14000
errors is specified, then the object must expose a data buffer\n\
14001
that will be decoded using the given encoding and error handler.\n\
14002
Otherwise, returns the result of object.__str__() (if defined)\n\
14003
or repr(object).\n\
14004
encoding defaults to 'utf-8'.\n\
14005
errors defaults to 'strict'.");
14006
14007
static PyObject *unicode_iter(PyObject *seq);
14008
14009
PyTypeObject PyUnicode_Type = {
14010
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14011
    "str",                        /* tp_name */
14012
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14013
    0,                            /* tp_itemsize */
14014
    /* Slots */
14015
    unicode_dealloc,              /* tp_dealloc */
14016
    0,                            /* tp_vectorcall_offset */
14017
    0,                            /* tp_getattr */
14018
    0,                            /* tp_setattr */
14019
    0,                            /* tp_as_async */
14020
    unicode_repr,                 /* tp_repr */
14021
    &unicode_as_number,           /* tp_as_number */
14022
    &unicode_as_sequence,         /* tp_as_sequence */
14023
    &unicode_as_mapping,          /* tp_as_mapping */
14024
    unicode_hash,                 /* tp_hash*/
14025
    0,                            /* tp_call*/
14026
    unicode_str,                  /* tp_str */
14027
    PyObject_GenericGetAttr,      /* tp_getattro */
14028
    0,                            /* tp_setattro */
14029
    0,                            /* tp_as_buffer */
14030
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14031
        Py_TPFLAGS_UNICODE_SUBCLASS |
14032
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14033
    unicode_doc,                  /* tp_doc */
14034
    0,                            /* tp_traverse */
14035
    0,                            /* tp_clear */
14036
    PyUnicode_RichCompare,        /* tp_richcompare */
14037
    0,                            /* tp_weaklistoffset */
14038
    unicode_iter,                 /* tp_iter */
14039
    0,                            /* tp_iternext */
14040
    unicode_methods,              /* tp_methods */
14041
    0,                            /* tp_members */
14042
    0,                            /* tp_getset */
14043
    0,                            /* tp_base */
14044
    0,                            /* tp_dict */
14045
    0,                            /* tp_descr_get */
14046
    0,                            /* tp_descr_set */
14047
    0,                            /* tp_dictoffset */
14048
    0,                            /* tp_init */
14049
    0,                            /* tp_alloc */
14050
    unicode_new,                  /* tp_new */
14051
    PyObject_Free,                /* tp_free */
14052
    .tp_vectorcall = unicode_vectorcall,
14053
};
14054
14055
/* Initialize the Unicode implementation */
14056
14057
static void
14058
_init_global_state(void)
14059
34
{
14060
34
    static int initialized = 0;
14061
34
    if (initialized) {
14062
0
        return;
14063
0
    }
14064
34
    initialized = 1;
14065
14066
    /* initialize the linebreak bloom filter */
14067
34
    const Py_UCS2 linebreak[] = {
14068
34
        0x000A, /* LINE FEED */
14069
34
        0x000D, /* CARRIAGE RETURN */
14070
34
        0x001C, /* FILE SEPARATOR */
14071
34
        0x001D, /* GROUP SEPARATOR */
14072
34
        0x001E, /* RECORD SEPARATOR */
14073
34
        0x0085, /* NEXT LINE */
14074
34
        0x2028, /* LINE SEPARATOR */
14075
34
        0x2029, /* PARAGRAPH SEPARATOR */
14076
34
    };
14077
34
    bloom_linebreak = make_bloom_mask(
14078
34
        PyUnicode_2BYTE_KIND, linebreak,
14079
34
        Py_ARRAY_LENGTH(linebreak));
14080
34
}
14081
14082
void
14083
_PyUnicode_InitState(PyInterpreterState *interp)
14084
34
{
14085
34
    if (!_Py_IsMainInterpreter(interp)) {
14086
0
        return;
14087
0
    }
14088
34
    _init_global_state();
14089
34
}
14090
14091
14092
PyStatus
14093
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14094
34
{
14095
34
    if (_Py_IsMainInterpreter(interp)) {
14096
34
        PyStatus status = init_global_interned_strings(interp);
14097
34
        if (_PyStatus_EXCEPTION(status)) {
14098
0
            return status;
14099
0
        }
14100
34
    }
14101
34
    assert(INTERNED_STRINGS);
14102
14103
34
    if (init_interned_dict(interp)) {
14104
0
        PyErr_Clear();
14105
0
        return _PyStatus_ERR("failed to create interned dict");
14106
0
    }
14107
14108
34
    return _PyStatus_OK();
14109
34
}
14110
14111
14112
PyStatus
14113
_PyUnicode_InitTypes(PyInterpreterState *interp)
14114
34
{
14115
34
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14116
0
        goto error;
14117
0
    }
14118
34
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14119
0
        goto error;
14120
0
    }
14121
34
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14122
0
        goto error;
14123
0
    }
14124
34
    return _PyStatus_OK();
14125
14126
0
error:
14127
0
    return _PyStatus_ERR("Can't initialize unicode types");
14128
34
}
14129
14130
static /* non-null */ PyObject*
14131
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14132
37.8k
{
14133
    // Note that this steals a reference to `s`, but in many cases that
14134
    // stolen ref is returned, requiring no decref/incref.
14135
14136
37.8k
    assert(s != NULL);
14137
37.8k
    assert(_PyUnicode_CHECK(s));
14138
37.8k
    assert(_PyUnicode_STATE(s).statically_allocated);
14139
37.8k
    assert(!PyUnicode_CHECK_INTERNED(s));
14140
14141
#ifdef Py_DEBUG
14142
    /* We must not add process-global interned string if there's already a
14143
     * per-interpreter interned_dict, which might contain duplicates.
14144
     */
14145
    PyObject *interned = get_interned_dict(interp);
14146
    assert(interned == NULL);
14147
#endif
14148
14149
    /* Look in the global cache first. */
14150
37.8k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14151
    /* We should only init each string once */
14152
37.8k
    assert(r == NULL);
14153
    /* but just in case (for the non-debug build), handle this */
14154
37.8k
    if (r != NULL && r != s) {
14155
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14156
0
        assert(_PyUnicode_CHECK(r));
14157
0
        Py_DECREF(s);
14158
0
        return Py_NewRef(r);
14159
0
    }
14160
14161
37.8k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14162
0
        Py_FatalError("failed to intern static string");
14163
0
    }
14164
14165
37.8k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14166
37.8k
    return s;
14167
37.8k
}
14168
14169
void
14170
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14171
37.8k
{
14172
    // This should only be called as part of runtime initialization
14173
37.8k
    assert(!Py_IsInitialized());
14174
14175
37.8k
    *p = intern_static(interp, *p);
14176
37.8k
    assert(*p);
14177
37.8k
}
14178
14179
static void
14180
immortalize_interned(PyObject *s)
14181
282k
{
14182
282k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14183
282k
    assert(!_Py_IsImmortal(s));
14184
#ifdef Py_REF_DEBUG
14185
    /* The reference count value should be excluded from the RefTotal.
14186
       The decrements to these objects will not be registered so they
14187
       need to be accounted for in here. */
14188
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14189
        _Py_DecRefTotal(_PyThreadState_GET());
14190
    }
14191
#endif
14192
282k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14193
282k
    _Py_SetImmortal(s);
14194
282k
}
14195
14196
static /* non-null */ PyObject*
14197
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14198
              bool immortalize)
14199
97.7M
{
14200
    // Note that this steals a reference to `s`, but in many cases that
14201
    // stolen ref is returned, requiring no decref/incref.
14202
14203
#ifdef Py_DEBUG
14204
    assert(s != NULL);
14205
    assert(_PyUnicode_CHECK(s));
14206
#else
14207
97.7M
    if (s == NULL || !PyUnicode_Check(s)) {
14208
0
        return s;
14209
0
    }
14210
97.7M
#endif
14211
14212
    /* If it's a subclass, we don't really know what putting
14213
       it in the interned dict might do. */
14214
97.7M
    if (!PyUnicode_CheckExact(s)) {
14215
0
        return s;
14216
0
    }
14217
14218
    /* Is it already interned? */
14219
97.7M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14220
5.37M
        case SSTATE_NOT_INTERNED:
14221
            // no, go on
14222
5.37M
            break;
14223
33.4k
        case SSTATE_INTERNED_MORTAL:
14224
            // yes but we might need to make it immortal
14225
33.4k
            if (immortalize) {
14226
5.35k
                immortalize_interned(s);
14227
5.35k
            }
14228
33.4k
            return s;
14229
92.3M
        default:
14230
            // all done
14231
92.3M
            return s;
14232
97.7M
    }
14233
14234
    /* Statically allocated strings must be already interned. */
14235
97.7M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14236
14237
#if Py_GIL_DISABLED
14238
    /* In the free-threaded build, all interned strings are immortal */
14239
    immortalize = 1;
14240
#endif
14241
14242
    /* If it's already immortal, intern it as such */
14243
5.37M
    if (_Py_IsImmortal(s)) {
14244
0
        immortalize = 1;
14245
0
    }
14246
14247
    /* if it's a short string, get the singleton */
14248
5.37M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14249
18.0k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14250
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14251
0
        assert(PyUnicode_CHECK_INTERNED(r));
14252
0
        Py_DECREF(s);
14253
0
        return r;
14254
0
    }
14255
#ifdef Py_DEBUG
14256
    assert(!unicode_is_singleton(s));
14257
#endif
14258
14259
    /* Look in the global cache now. */
14260
5.37M
    {
14261
5.37M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14262
5.37M
        if (r != NULL) {
14263
562k
            assert(_PyUnicode_STATE(r).statically_allocated);
14264
562k
            assert(r != s);  // r must be statically_allocated; s is not
14265
562k
            Py_DECREF(s);
14266
562k
            return Py_NewRef(r);
14267
562k
        }
14268
5.37M
    }
14269
14270
    /* Do a setdefault on the per-interpreter cache. */
14271
4.80M
    PyObject *interned = get_interned_dict(interp);
14272
4.80M
    assert(interned != NULL);
14273
#ifdef Py_GIL_DISABLED
14274
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14275
#endif
14276
4.80M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14277
4.80M
    PyObject *t;
14278
4.80M
    {
14279
4.80M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14280
4.80M
        if (res < 0) {
14281
0
            PyErr_Clear();
14282
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14283
0
            return s;
14284
0
        }
14285
4.80M
        else if (res == 1) {
14286
            // value was already present (not inserted)
14287
4.05M
            Py_DECREF(s);
14288
4.05M
            if (immortalize &&
14289
1.16M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14290
9.74k
                immortalize_interned(t);
14291
9.74k
            }
14292
4.05M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14293
4.05M
            return t;
14294
4.05M
        }
14295
752k
        else {
14296
            // value was newly inserted
14297
752k
            assert (s == t);
14298
752k
            Py_DECREF(t);
14299
752k
        }
14300
4.80M
    }
14301
14302
    /* NOT_INTERNED -> INTERNED_MORTAL */
14303
14304
4.80M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14305
14306
752k
    if (!_Py_IsImmortal(s)) {
14307
        /* The two references in interned dict (key and value) are not counted.
14308
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14309
752k
        Py_DECREF(s);
14310
752k
        Py_DECREF(s);
14311
752k
    }
14312
752k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14313
14314
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14315
14316
#ifdef Py_DEBUG
14317
    if (_Py_IsImmortal(s)) {
14318
        assert(immortalize);
14319
    }
14320
#endif
14321
752k
    if (immortalize) {
14322
267k
        immortalize_interned(s);
14323
267k
    }
14324
14325
752k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14326
752k
    return s;
14327
4.80M
}
14328
14329
void
14330
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14331
16.8M
{
14332
16.8M
    *p = intern_common(interp, *p, 1);
14333
16.8M
    assert(*p);
14334
16.8M
}
14335
14336
void
14337
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14338
80.9M
{
14339
80.9M
    *p = intern_common(interp, *p, 0);
14340
80.9M
    assert(*p);
14341
80.9M
}
14342
14343
14344
void
14345
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14346
0
{
14347
0
    _PyUnicode_InternImmortal(interp, p);
14348
0
    return;
14349
0
}
14350
14351
void
14352
PyUnicode_InternInPlace(PyObject **p)
14353
0
{
14354
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14355
0
    _PyUnicode_InternMortal(interp, p);
14356
0
}
14357
14358
// Public-looking name kept for the stable ABI; user should not call this:
14359
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14360
void
14361
PyUnicode_InternImmortal(PyObject **p)
14362
0
{
14363
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14364
0
    _PyUnicode_InternImmortal(interp, p);
14365
0
}
14366
14367
PyObject *
14368
PyUnicode_InternFromString(const char *cp)
14369
858k
{
14370
858k
    PyObject *s = PyUnicode_FromString(cp);
14371
858k
    if (s == NULL) {
14372
0
        return NULL;
14373
0
    }
14374
858k
    PyInterpreterState *interp = _PyInterpreterState_GET();
14375
858k
    _PyUnicode_InternMortal(interp, &s);
14376
858k
    return s;
14377
858k
}
14378
14379
14380
void
14381
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14382
0
{
14383
0
    PyObject *interned = get_interned_dict(interp);
14384
0
    if (interned == NULL) {
14385
0
        return;
14386
0
    }
14387
0
    assert(PyDict_CheckExact(interned));
14388
14389
0
    if (has_shared_intern_dict(interp)) {
14390
        // the dict doesn't belong to this interpreter, skip the debug
14391
        // checks on it and just clear the pointer to it
14392
0
        clear_interned_dict(interp);
14393
0
        return;
14394
0
    }
14395
14396
#ifdef INTERNED_STATS
14397
    fprintf(stderr, "releasing %zd interned strings\n",
14398
            PyDict_GET_SIZE(interned));
14399
14400
    Py_ssize_t total_length = 0;
14401
#endif
14402
0
    Py_ssize_t pos = 0;
14403
0
    PyObject *s, *ignored_value;
14404
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14405
0
        int shared = 0;
14406
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14407
0
        case SSTATE_INTERNED_IMMORTAL:
14408
            /* Make immortal interned strings mortal again. */
14409
            // Skip the Immortal Instance check and restore
14410
            // the two references (key and value) ignored
14411
            // by PyUnicode_InternInPlace().
14412
0
            _Py_SetMortal(s, 2);
14413
#ifdef Py_REF_DEBUG
14414
            /* let's be pedantic with the ref total */
14415
            _Py_IncRefTotal(_PyThreadState_GET());
14416
            _Py_IncRefTotal(_PyThreadState_GET());
14417
#endif
14418
#ifdef INTERNED_STATS
14419
            total_length += PyUnicode_GET_LENGTH(s);
14420
#endif
14421
0
            break;
14422
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14423
            /* It is shared between interpreters, so we should unmark it
14424
               only when this is the last interpreter in which it's
14425
               interned.  We immortalize all the statically initialized
14426
               strings during startup, so we can rely on the
14427
               main interpreter to be the last one. */
14428
0
            if (!_Py_IsMainInterpreter(interp)) {
14429
0
                shared = 1;
14430
0
            }
14431
0
            break;
14432
0
        case SSTATE_INTERNED_MORTAL:
14433
            // Restore 2 references held by the interned dict; these will
14434
            // be decref'd by clear_interned_dict's PyDict_Clear.
14435
0
            _Py_RefcntAdd(s, 2);
14436
#ifdef Py_REF_DEBUG
14437
            /* let's be pedantic with the ref total */
14438
            _Py_IncRefTotal(_PyThreadState_GET());
14439
            _Py_IncRefTotal(_PyThreadState_GET());
14440
#endif
14441
0
            break;
14442
0
        case SSTATE_NOT_INTERNED:
14443
0
            _Py_FALLTHROUGH;
14444
0
        default:
14445
0
            Py_UNREACHABLE();
14446
0
        }
14447
0
        if (!shared) {
14448
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14449
0
        }
14450
0
    }
14451
#ifdef INTERNED_STATS
14452
    fprintf(stderr,
14453
            "total length of all interned strings: %zd characters\n",
14454
            total_length);
14455
#endif
14456
14457
0
    struct _Py_unicode_state *state = &interp->unicode;
14458
0
    struct _Py_unicode_ids *ids = &state->ids;
14459
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14460
0
        Py_XINCREF(ids->array[i]);
14461
0
    }
14462
0
    clear_interned_dict(interp);
14463
0
    if (_Py_IsMainInterpreter(interp)) {
14464
0
        clear_global_interned_strings();
14465
0
    }
14466
0
}
14467
14468
14469
/********************* Unicode Iterator **************************/
14470
14471
/* Iterator state used by both str iterator types defined below
   (PyUnicodeIter_Type and _PyUnicodeASCIIIter_Type). */
typedef struct {
14472
    PyObject_HEAD
14473
    /* Index of the next character to yield from it_seq. */
    Py_ssize_t it_index;
14474
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14475
} unicodeiterobject;
14476
14477
static void
14478
unicodeiter_dealloc(PyObject *op)
14479
2.31M
{
14480
2.31M
    unicodeiterobject *it = (unicodeiterobject *)op;
14481
2.31M
    _PyObject_GC_UNTRACK(it);
14482
2.31M
    Py_XDECREF(it->it_seq);
14483
2.31M
    PyObject_GC_Del(it);
14484
2.31M
}
14485
14486
static int
14487
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14488
4
{
14489
4
    unicodeiterobject *it = (unicodeiterobject *)op;
14490
4
    Py_VISIT(it->it_seq);
14491
4
    return 0;
14492
4
}
14493
14494
/* tp_iternext for the generic str iterator.  Returns the next character
   as a str, or NULL (without setting an exception) when exhausted; on
   first exhaustion the reference to the string is released. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: forget the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    int kind = PyUnicode_KIND(seq);
    const void *data = PyUnicode_DATA(seq);
    Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
14518
14519
/* tp_iternext for the compact-ASCII str iterator.  Each character is
   returned as the matching entry of the ASCII singleton table, so no new
   object is created.  Returns NULL (no exception) when exhausted. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: forget the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    /* The character data is read from the memory immediately following
       the PyASCIIObject header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14541
14542
/* __length_hint__: number of characters still to be yielded, or 0 once
   the iterator is exhausted. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    if (it->it_seq == NULL) {
        return PyLong_FromSsize_t(0);
    }
    Py_ssize_t remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    return PyLong_FromSsize_t(remaining);
}
14551
14552
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14553
14554
/* __reduce__: return (iter, (seq,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* Look the builtin up first: _PyEval_GetBuiltin can invoke arbitrary
     * code, so the iterator's fields must only be read afterwards
     * (see issue #101765). */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *empty = _PyUnicode_GetEmpty();
        if (empty == NULL) {
            Py_XDECREF(iter);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter, empty);
    }

    return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
}
14575
14576
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14577
14578
/* __setstate__: restore the iteration position from the integer `state`,
   clamped to [0, len(seq)].  An exhausted iterator is left unchanged.
   Returns None, or NULL if `state` cannot be converted to Py_ssize_t. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14594
14595
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14596
14597
static PyMethodDef unicodeiter_methods[] = {
14598
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14599
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14600
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14601
    {NULL,      NULL}       /* sentinel */
14602
};
14603
14604
PyTypeObject PyUnicodeIter_Type = {
14605
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14606
    "str_iterator",         /* tp_name */
14607
    sizeof(unicodeiterobject),      /* tp_basicsize */
14608
    0,                  /* tp_itemsize */
14609
    /* methods */
14610
    unicodeiter_dealloc,/* tp_dealloc */
14611
    0,                  /* tp_vectorcall_offset */
14612
    0,                  /* tp_getattr */
14613
    0,                  /* tp_setattr */
14614
    0,                  /* tp_as_async */
14615
    0,                  /* tp_repr */
14616
    0,                  /* tp_as_number */
14617
    0,                  /* tp_as_sequence */
14618
    0,                  /* tp_as_mapping */
14619
    0,                  /* tp_hash */
14620
    0,                  /* tp_call */
14621
    0,                  /* tp_str */
14622
    PyObject_GenericGetAttr,        /* tp_getattro */
14623
    0,                  /* tp_setattro */
14624
    0,                  /* tp_as_buffer */
14625
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14626
    0,                  /* tp_doc */
14627
    unicodeiter_traverse, /* tp_traverse */
14628
    0,                  /* tp_clear */
14629
    0,                  /* tp_richcompare */
14630
    0,                  /* tp_weaklistoffset */
14631
    PyObject_SelfIter,          /* tp_iter */
14632
    unicodeiter_next,   /* tp_iternext */
14633
    unicodeiter_methods,            /* tp_methods */
14634
    0,
14635
};
14636
14637
/* Type object of the str iterator used for compact ASCII strings
   ("str_ascii_iterator").  It shares the object layout, deallocator,
   traversal and methods of PyUnicodeIter_Type and differs only in its
   tp_iternext slot. */
PyTypeObject _PyUnicodeASCIIIter_Type = {
14638
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14639
    .tp_name = "str_ascii_iterator",
14640
    .tp_basicsize = sizeof(unicodeiterobject),
14641
    .tp_dealloc = unicodeiter_dealloc,
14642
    .tp_getattro = PyObject_GenericGetAttr,
14643
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14644
    .tp_traverse = unicodeiter_traverse,
14645
    .tp_iter = PyObject_SelfIter,
14646
    /* ASCII-specific next function (returns singleton characters). */
    .tp_iternext = unicode_ascii_iter_next,
14647
    .tp_methods = unicodeiter_methods,
14648
};
14649
14650
/* tp_iter for str: create an iterator over `seq`, choosing the ASCII
   iterator type for compact ASCII strings and the generic one otherwise.
   Returns NULL with an error set if `seq` is not a str or allocation
   fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                              ? &_PyUnicodeASCIIIter_Type
                              : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
14672
14673
static int
14674
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14675
136
{
14676
136
    int res;
14677
136
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14678
136
    if (res == -2) {
14679
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14680
0
        return -1;
14681
0
    }
14682
136
    if (res < 0) {
14683
0
        PyErr_NoMemory();
14684
0
        return -1;
14685
0
    }
14686
136
    return 0;
14687
136
}
14688
14689
14690
static int
14691
config_get_codec_name(wchar_t **config_encoding)
14692
68
{
14693
68
    char *encoding;
14694
68
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14695
0
        return -1;
14696
0
    }
14697
14698
68
    PyObject *name_obj = NULL;
14699
68
    PyObject *codec = _PyCodec_Lookup(encoding);
14700
68
    PyMem_RawFree(encoding);
14701
14702
68
    if (!codec)
14703
0
        goto error;
14704
14705
68
    name_obj = PyObject_GetAttrString(codec, "name");
14706
68
    Py_CLEAR(codec);
14707
68
    if (!name_obj) {
14708
0
        goto error;
14709
0
    }
14710
14711
68
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14712
68
    Py_DECREF(name_obj);
14713
68
    if (wname == NULL) {
14714
0
        goto error;
14715
0
    }
14716
14717
68
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14718
68
    if (raw_wname == NULL) {
14719
0
        PyMem_Free(wname);
14720
0
        PyErr_NoMemory();
14721
0
        goto error;
14722
0
    }
14723
14724
68
    PyMem_RawFree(*config_encoding);
14725
68
    *config_encoding = raw_wname;
14726
14727
68
    PyMem_Free(wname);
14728
68
    return 0;
14729
14730
0
error:
14731
0
    Py_XDECREF(codec);
14732
0
    Py_XDECREF(name_obj);
14733
0
    return -1;
14734
68
}
14735
14736
14737
static PyStatus
14738
init_stdio_encoding(PyInterpreterState *interp)
14739
34
{
14740
    /* Update the stdio encoding to the normalized Python codec name. */
14741
34
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14742
34
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14743
0
        return _PyStatus_ERR("failed to get the Python codec name "
14744
0
                             "of the stdio encoding");
14745
0
    }
14746
34
    return _PyStatus_OK();
14747
34
}
14748
14749
14750
static int
14751
init_fs_codec(PyInterpreterState *interp)
14752
34
{
14753
34
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14754
14755
34
    _Py_error_handler error_handler;
14756
34
    error_handler = get_error_handler_wide(config->filesystem_errors);
14757
34
    if (error_handler == _Py_ERROR_UNKNOWN) {
14758
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14759
0
        return -1;
14760
0
    }
14761
14762
34
    char *encoding, *errors;
14763
34
    if (encode_wstr_utf8(config->filesystem_encoding,
14764
34
                         &encoding,
14765
34
                         "filesystem_encoding") < 0) {
14766
0
        return -1;
14767
0
    }
14768
14769
34
    if (encode_wstr_utf8(config->filesystem_errors,
14770
34
                         &errors,
14771
34
                         "filesystem_errors") < 0) {
14772
0
        PyMem_RawFree(encoding);
14773
0
        return -1;
14774
0
    }
14775
14776
34
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14777
34
    PyMem_RawFree(fs_codec->encoding);
14778
34
    fs_codec->encoding = encoding;
14779
    /* encoding has been normalized by init_fs_encoding() */
14780
34
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14781
34
    PyMem_RawFree(fs_codec->errors);
14782
34
    fs_codec->errors = errors;
14783
34
    fs_codec->error_handler = error_handler;
14784
14785
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14786
    assert(fs_codec->utf8 == 1);
14787
#endif
14788
14789
    /* At this point, PyUnicode_EncodeFSDefault() and
14790
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14791
       the C implementation of the filesystem encoding. */
14792
14793
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14794
       global configuration variables. */
14795
34
    if (_Py_IsMainInterpreter(interp)) {
14796
14797
34
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14798
34
                                      fs_codec->errors) < 0) {
14799
0
            PyErr_NoMemory();
14800
0
            return -1;
14801
0
        }
14802
34
    }
14803
34
    return 0;
14804
34
}
14805
14806
14807
static PyStatus
14808
init_fs_encoding(PyThreadState *tstate)
14809
34
{
14810
34
    PyInterpreterState *interp = tstate->interp;
14811
14812
    /* Update the filesystem encoding to the normalized Python codec name.
14813
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14814
       (Python codec name). */
14815
34
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14816
34
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14817
0
        _Py_DumpPathConfig(tstate);
14818
0
        return _PyStatus_ERR("failed to get the Python codec "
14819
0
                             "of the filesystem encoding");
14820
0
    }
14821
14822
34
    if (init_fs_codec(interp) < 0) {
14823
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14824
0
    }
14825
34
    return _PyStatus_OK();
14826
34
}
14827
14828
14829
PyStatus
14830
_PyUnicode_InitEncodings(PyThreadState *tstate)
14831
34
{
14832
34
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14833
34
    if (_PyStatus_EXCEPTION(status)) {
14834
0
        return status;
14835
0
    }
14836
34
    status = init_fs_encoding(tstate);
14837
34
    if (_PyStatus_EXCEPTION(status)) {
14838
0
        return status;
14839
0
    }
14840
14841
34
    return init_stdio_encoding(tstate->interp);
14842
34
}
14843
14844
14845
static void
14846
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14847
0
{
14848
0
    PyMem_RawFree(fs_codec->encoding);
14849
0
    fs_codec->encoding = NULL;
14850
0
    fs_codec->utf8 = 0;
14851
0
    PyMem_RawFree(fs_codec->errors);
14852
0
    fs_codec->errors = NULL;
14853
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14854
0
}
14855
14856
14857
#ifdef MS_WINDOWS
14858
int
14859
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
14860
{
14861
    PyInterpreterState *interp = _PyInterpreterState_GET();
14862
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);
14863
14864
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
14865
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
14866
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
14867
    if (encoding == NULL || errors == NULL) {
14868
        PyMem_RawFree(encoding);
14869
        PyMem_RawFree(errors);
14870
        PyErr_NoMemory();
14871
        return -1;
14872
    }
14873
14874
    PyMem_RawFree(config->filesystem_encoding);
14875
    config->filesystem_encoding = encoding;
14876
    PyMem_RawFree(config->filesystem_errors);
14877
    config->filesystem_errors = errors;
14878
14879
    return init_fs_codec(interp);
14880
}
14881
#endif
14882
14883
14884
#ifdef Py_DEBUG
14885
static inline int
14886
unicode_is_finalizing(void)
14887
{
14888
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
14889
}
14890
#endif
14891
14892
14893
void
14894
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14895
0
{
14896
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14897
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14898
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14899
0
}
14900
14901
14902
void
14903
_PyUnicode_Fini(PyInterpreterState *interp)
14904
0
{
14905
0
    struct _Py_unicode_state *state = &interp->unicode;
14906
14907
0
    if (!has_shared_intern_dict(interp)) {
14908
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14909
0
        assert(get_interned_dict(interp) == NULL);
14910
0
    }
14911
14912
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14913
14914
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14915
    // subsequent initialization of interpreter.
14916
0
    interp->unicode.ucnhash_capi = NULL;
14917
14918
0
    unicode_clear_identifiers(state);
14919
0
}
14920
14921
/* A _string module, to export formatter_parser and formatter_field_name_split
14922
   to the string.Formatter class implemented in Python. */
14923
14924
static PyMethodDef _string_methods[] = {
14925
    {"formatter_field_name_split", formatter_field_name_split,
14926
     METH_O, PyDoc_STR("split the argument as a field name")},
14927
    {"formatter_parser", formatter_parser,
14928
     METH_O, PyDoc_STR("parse the argument as a format string")},
14929
    {NULL, NULL}
14930
};
14931
14932
static PyModuleDef_Slot module_slots[] = {
14933
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14934
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14935
    {0, NULL}
14936
};
14937
14938
static struct PyModuleDef _string_module = {
14939
    PyModuleDef_HEAD_INIT,
14940
    .m_name = "_string",
14941
    .m_doc = PyDoc_STR("string helper module"),
14942
    .m_size = 0,
14943
    .m_methods = _string_methods,
14944
    .m_slots = module_slots,
14945
};
14946
14947
PyMODINIT_FUNC
14948
PyInit__string(void)
14949
10
{
14950
10
    return PyModuleDef_Init(&_string_module);
14951
10
}
14952
14953
14954
#undef PyUnicode_KIND
14955
int PyUnicode_KIND(PyObject *op)
14956
0
{
14957
0
    if (!PyUnicode_Check(op)) {
14958
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14959
0
        return -1;
14960
0
    }
14961
0
    return _PyASCIIObject_CAST(op)->state.kind;
14962
0
}
14963
14964
#undef PyUnicode_DATA
14965
void* PyUnicode_DATA(PyObject *op)
14966
0
{
14967
0
    if (!PyUnicode_Check(op)) {
14968
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14969
0
        return NULL;
14970
0
    }
14971
0
    return _PyUnicode_DATA(op);
14972
0
}