Coverage Report

Created: 2026-04-20 06:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
24.9M
#define MAX_UNICODE _Py_MAX_UNICODE
105
265M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
static inline char* _PyUnicode_UTF8(PyObject *op)
114
254M
{
115
254M
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
116
254M
}
117
118
static inline char* PyUnicode_UTF8(PyObject *op)
119
171M
{
120
171M
    assert(_PyUnicode_CHECK(op));
121
171M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
122
155M
        return ((char*)(_PyASCIIObject_CAST(op) + 1));
123
155M
    }
124
15.8M
    else {
125
15.8M
         return _PyUnicode_UTF8(op);
126
15.8M
    }
127
171M
}
128
129
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
130
29.7M
{
131
29.7M
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
132
29.7M
}
133
134
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
135
80.7M
{
136
80.7M
    assert(_PyUnicode_CHECK(op));
137
80.7M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
138
77.5M
         return _PyASCIIObject_CAST(op)->length;
139
77.5M
    }
140
3.11M
    else {
141
3.11M
         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
142
3.11M
    }
143
80.7M
}
144
145
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
146
29.7M
{
147
29.7M
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
148
29.7M
}
149
150
#define _PyUnicode_LENGTH(op)                           \
151
624M
    (_PyASCIIObject_CAST(op)->length)
152
#define _PyUnicode_STATE(op)                            \
153
3.94G
    (_PyASCIIObject_CAST(op)->state)
154
#define _PyUnicode_HASH(op)                             \
155
578M
    (_PyASCIIObject_CAST(op)->hash)
156
157
1.05G
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
160
48.6M
{
161
48.6M
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
162
48.6M
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
64.2M
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
168
0
{
169
0
    assert(_PyUnicode_CHECK(op));
170
0
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
171
0
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
172
0
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
622M
{
178
622M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
224M
            && _PyUnicode_UTF8(op) != NULL
180
13.7M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
622M
}
182
183
184
213M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declarations */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
107M
{
204
107M
    _Py_DECLARE_STR(empty, "");
205
107M
    return &_Py_STR(empty);
206
107M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
6.24M
{
213
6.24M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
6.24M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
6.40M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
6.40M
{
256
6.40M
    return unicode_hash((PyObject *)key);
257
6.40M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
538k
{
262
538k
    PyObject *obj1 = (PyObject *)key1;
263
538k
    PyObject *obj2 = (PyObject *)key2;
264
538k
    if (obj1 != NULL && obj2 != NULL) {
265
538k
        return unicode_eq(obj1, obj2);
266
538k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
538k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
36
{
285
36
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
36
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
36
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
36
{
292
36
    assert(get_interned_dict(interp) == NULL);
293
36
    PyObject *interned;
294
36
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
36
    else {
299
36
        interned = PyDict_New();
300
36
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
36
    }
304
36
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
36
    return 0;
306
36
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
36
{
325
36
    assert(INTERNED_STRINGS == NULL);
326
36
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
36
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
36
        hashtable_unicode_hash,
330
36
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
36
        NULL,
334
36
        NULL,
335
36
        &hashtable_alloc
336
36
    );
337
36
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers, if present, to save space;
347
        * otherwise it generates them, and they are interned to speed up dict lookups.
348
    */
349
36
    _PyUnicode_InitStaticStrings(interp);
350
351
9.25k
    for (int i = 0; i < 256; i++) {
352
9.21k
        PyObject *s = LATIN1(i);
353
9.21k
        _PyUnicode_InternStatic(interp, &s);
354
9.21k
        assert(s == LATIN1(i));
355
9.21k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
36
    return _PyStatus_OK();
364
36
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
#define _Py_RETURN_UNICODE_EMPTY()   \
375
52.8M
    do {                             \
376
52.8M
        return _PyUnicode_GetEmpty();\
377
52.8M
    } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters */
381
const unsigned char _Py_ascii_whitespace[] = {
382
    0, 0, 0, 0, 0, 0, 0, 0,
383
/*     case 0x0009: * CHARACTER TABULATION */
384
/*     case 0x000A: * LINE FEED */
385
/*     case 0x000B: * LINE TABULATION */
386
/*     case 0x000C: * FORM FEED */
387
/*     case 0x000D: * CARRIAGE RETURN */
388
    0, 1, 1, 1, 1, 1, 0, 0,
389
    0, 0, 0, 0, 0, 0, 0, 0,
390
/*     case 0x001C: * FILE SEPARATOR */
391
/*     case 0x001D: * GROUP SEPARATOR */
392
/*     case 0x001E: * RECORD SEPARATOR */
393
/*     case 0x001F: * UNIT SEPARATOR */
394
    0, 0, 0, 0, 1, 1, 1, 1,
395
/*     case 0x0020: * SPACE */
396
    1, 0, 0, 0, 0, 0, 0, 0,
397
    0, 0, 0, 0, 0, 0, 0, 0,
398
    0, 0, 0, 0, 0, 0, 0, 0,
399
    0, 0, 0, 0, 0, 0, 0, 0,
400
401
    0, 0, 0, 0, 0, 0, 0, 0,
402
    0, 0, 0, 0, 0, 0, 0, 0,
403
    0, 0, 0, 0, 0, 0, 0, 0,
404
    0, 0, 0, 0, 0, 0, 0, 0,
405
    0, 0, 0, 0, 0, 0, 0, 0,
406
    0, 0, 0, 0, 0, 0, 0, 0,
407
    0, 0, 0, 0, 0, 0, 0, 0,
408
    0, 0, 0, 0, 0, 0, 0, 0
409
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Fast detection of ASCII line break characters (same scheme as _Py_ascii_whitespace) */
436
static const unsigned char ascii_linebreak[] = {
437
    0, 0, 0, 0, 0, 0, 0, 0,
438
/*         0x000A, * LINE FEED */
439
/*         0x000B, * LINE TABULATION */
440
/*         0x000C, * FORM FEED */
441
/*         0x000D, * CARRIAGE RETURN */
442
    0, 0, 1, 1, 1, 1, 0, 0,
443
    0, 0, 0, 0, 0, 0, 0, 0,
444
/*         0x001C, * FILE SEPARATOR */
445
/*         0x001D, * GROUP SEPARATOR */
446
/*         0x001E, * RECORD SEPARATOR */
447
    0, 0, 0, 0, 1, 1, 1, 0,
448
    0, 0, 0, 0, 0, 0, 0, 0,
449
    0, 0, 0, 0, 0, 0, 0, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0,
452
453
    0, 0, 0, 0, 0, 0, 0, 0,
454
    0, 0, 0, 0, 0, 0, 0, 0,
455
    0, 0, 0, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 0, 0, 0, 0, 0, 0, 0,
459
    0, 0, 0, 0, 0, 0, 0, 0,
460
    0, 0, 0, 0, 0, 0, 0, 0
461
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
3.84M
{
471
3.84M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
3.03M
        return _Py_ERROR_STRICT;
473
3.03M
    }
474
805k
    if (strcmp(errors, "surrogateescape") == 0) {
475
572k
        return _Py_ERROR_SURROGATEESCAPE;
476
572k
    }
477
233k
    if (strcmp(errors, "replace") == 0) {
478
233k
        return _Py_ERROR_REPLACE;
479
233k
    }
480
0
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
0
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
0
    if (strcmp(errors, "surrogatepass") == 0) {
487
0
        return _Py_ERROR_SURROGATEPASS;
488
0
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
72
{
499
72
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
72
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
72
        return _Py_ERROR_SURROGATEESCAPE;
504
72
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
45.0M
{
527
45.0M
    if (encoding == NULL && errors == NULL) {
528
13.0M
        return 0;
529
13.0M
    }
530
531
31.9M
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
31.9M
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
31.9M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
31.9M
        return 0;
536
31.9M
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before _PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, this allows
548
     * calling PyObject_Dump() during finalization for debugging purposes.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
0
{
590
0
#define CHECK(expr) \
591
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
593
0
    assert(op != NULL);
594
0
    CHECK(PyUnicode_Check(op));
595
596
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
597
0
    int kind = ascii->state.kind;
598
599
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
600
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
601
0
    }
602
0
    else {
603
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
604
0
        void *data;
605
606
0
        if (ascii->state.compact == 1) {
607
0
            data = compact + 1;
608
0
            CHECK(kind == PyUnicode_1BYTE_KIND
609
0
                                 || kind == PyUnicode_2BYTE_KIND
610
0
                                 || kind == PyUnicode_4BYTE_KIND);
611
0
            CHECK(ascii->state.ascii == 0);
612
0
            CHECK(_PyUnicode_UTF8(op) != data);
613
0
        }
614
0
        else {
615
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
616
617
0
            data = unicode->data.any;
618
0
            CHECK(kind == PyUnicode_1BYTE_KIND
619
0
                     || kind == PyUnicode_2BYTE_KIND
620
0
                     || kind == PyUnicode_4BYTE_KIND);
621
0
            CHECK(ascii->state.compact == 0);
622
0
            CHECK(data != NULL);
623
0
            if (ascii->state.ascii) {
624
0
                CHECK(_PyUnicode_UTF8(op) == data);
625
0
                CHECK(compact->utf8_length == ascii->length);
626
0
            }
627
0
            else {
628
0
                CHECK(_PyUnicode_UTF8(op) != data);
629
0
            }
630
0
        }
631
0
#ifndef Py_GIL_DISABLED
632
0
        if (_PyUnicode_UTF8(op) == NULL)
633
0
            CHECK(compact->utf8_length == 0);
634
0
#endif
635
0
    }
636
637
    /* check that the best kind is used: O(n) operation */
638
0
    if (check_content) {
639
0
        Py_ssize_t i;
640
0
        Py_UCS4 maxchar = 0;
641
0
        const void *data;
642
0
        Py_UCS4 ch;
643
644
0
        data = PyUnicode_DATA(ascii);
645
0
        for (i=0; i < ascii->length; i++)
646
0
        {
647
0
            ch = PyUnicode_READ(kind, data, i);
648
0
            if (ch > maxchar)
649
0
                maxchar = ch;
650
0
        }
651
0
        if (kind == PyUnicode_1BYTE_KIND) {
652
0
            if (ascii->state.ascii == 0) {
653
0
                CHECK(maxchar >= 128);
654
0
                CHECK(maxchar <= 255);
655
0
            }
656
0
            else
657
0
                CHECK(maxchar < 128);
658
0
        }
659
0
        else if (kind == PyUnicode_2BYTE_KIND) {
660
0
            CHECK(maxchar >= 0x100);
661
0
            CHECK(maxchar <= 0xFFFF);
662
0
        }
663
0
        else {
664
0
            CHECK(maxchar >= 0x10000);
665
0
            CHECK(maxchar <= MAX_UNICODE);
666
0
        }
667
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
668
0
    }
669
670
    /* Check interning state */
671
#ifdef Py_DEBUG
672
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
673
    // extensions can make immortal strings mortal (but with a high enough
674
    // refcount).
675
    // The other way is extremely unlikely (worth a potential failed assertion
676
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
677
    switch (PyUnicode_CHECK_INTERNED(op)) {
678
        case SSTATE_NOT_INTERNED:
679
            if (ascii->state.statically_allocated) {
680
                // This state is for two exceptions:
681
                // - strings are currently checked before they're interned
682
                // - the 256 one-latin1-character strings
683
                //   are static but use SSTATE_NOT_INTERNED
684
            }
685
            else {
686
                CHECK(!_Py_IsImmortal(op));
687
            }
688
            break;
689
        case SSTATE_INTERNED_MORTAL:
690
            CHECK(!ascii->state.statically_allocated);
691
            CHECK(!_Py_IsImmortal(op));
692
            break;
693
        case SSTATE_INTERNED_IMMORTAL:
694
            CHECK(!ascii->state.statically_allocated);
695
            break;
696
        case SSTATE_INTERNED_IMMORTAL_STATIC:
697
            CHECK(ascii->state.statically_allocated);
698
            break;
699
        default:
700
            Py_UNREACHABLE();
701
    }
702
#endif
703
704
0
    return 1;
705
706
0
#undef CHECK
707
0
}
708
709
PyObject*
710
_PyUnicode_Result(PyObject *unicode)
711
60.3M
{
712
60.3M
    assert(_PyUnicode_CHECK(unicode));
713
714
60.3M
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
715
60.3M
    if (length == 0) {
716
246
        PyObject *empty = _PyUnicode_GetEmpty();
717
246
        if (unicode != empty) {
718
0
            Py_DECREF(unicode);
719
0
        }
720
246
        return empty;
721
246
    }
722
723
60.3M
    if (length == 1) {
724
3.20M
        int kind = PyUnicode_KIND(unicode);
725
3.20M
        if (kind == PyUnicode_1BYTE_KIND) {
726
153k
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
727
153k
            Py_UCS1 ch = data[0];
728
153k
            PyObject *latin1_char = LATIN1(ch);
729
153k
            if (unicode != latin1_char) {
730
148k
                Py_DECREF(unicode);
731
148k
            }
732
153k
            return latin1_char;
733
153k
        }
734
3.20M
    }
735
736
60.3M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
737
60.1M
    return unicode;
738
60.3M
}
739
1.74M
#define unicode_result _PyUnicode_Result
740
741
static PyObject*
742
unicode_result_unchanged(PyObject *unicode)
743
95.4M
{
744
95.4M
    if (PyUnicode_CheckExact(unicode)) {
745
92.5M
        return Py_NewRef(unicode);
746
92.5M
    }
747
2.87M
    else
748
        /* Subtype -- return genuine unicode string with the same value. */
749
2.87M
        return _PyUnicode_Copy(unicode);
750
95.4M
}
751
752
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
753
   ASCII, Latin1, UTF-8, etc. */
754
static char*
755
backslashreplace(PyBytesWriter *writer, char *str,
756
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
757
0
{
758
0
    Py_ssize_t size, i;
759
0
    Py_UCS4 ch;
760
0
    int kind;
761
0
    const void *data;
762
763
0
    kind = PyUnicode_KIND(unicode);
764
0
    data = PyUnicode_DATA(unicode);
765
766
0
    size = 0;
767
    /* determine replacement size */
768
0
    for (i = collstart; i < collend; ++i) {
769
0
        Py_ssize_t incr;
770
771
0
        ch = PyUnicode_READ(kind, data, i);
772
0
        if (ch < 0x100)
773
0
            incr = 2+2;
774
0
        else if (ch < 0x10000)
775
0
            incr = 2+4;
776
0
        else {
777
0
            assert(ch <= MAX_UNICODE);
778
0
            incr = 2+8;
779
0
        }
780
0
        if (size > PY_SSIZE_T_MAX - incr) {
781
0
            PyErr_SetString(PyExc_OverflowError,
782
0
                            "encoded result is too long for a Python string");
783
0
            return NULL;
784
0
        }
785
0
        size += incr;
786
0
    }
787
788
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
789
0
    if (str == NULL) {
790
0
        return NULL;
791
0
    }
792
793
    /* generate replacement */
794
0
    for (i = collstart; i < collend; ++i) {
795
0
        ch = PyUnicode_READ(kind, data, i);
796
0
        *str++ = '\\';
797
0
        if (ch >= 0x00010000) {
798
0
            *str++ = 'U';
799
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
800
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
801
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
802
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
803
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
804
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
805
0
        }
806
0
        else if (ch >= 0x100) {
807
0
            *str++ = 'u';
808
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
810
0
        }
811
0
        else
812
0
            *str++ = 'x';
813
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
814
0
        *str++ = Py_hexdigits[ch&0xf];
815
0
    }
816
0
    return str;
817
0
}
818
819
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
820
   ASCII, Latin1, UTF-8, etc. */
821
static char*
822
xmlcharrefreplace(PyBytesWriter *writer, char *str,
823
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
824
0
{
825
0
    Py_ssize_t size, i;
826
0
    Py_UCS4 ch;
827
0
    int kind;
828
0
    const void *data;
829
830
0
    kind = PyUnicode_KIND(unicode);
831
0
    data = PyUnicode_DATA(unicode);
832
833
0
    size = 0;
834
    /* determine replacement size */
835
0
    for (i = collstart; i < collend; ++i) {
836
0
        Py_ssize_t incr;
837
838
0
        ch = PyUnicode_READ(kind, data, i);
839
0
        if (ch < 10)
840
0
            incr = 2+1+1;
841
0
        else if (ch < 100)
842
0
            incr = 2+2+1;
843
0
        else if (ch < 1000)
844
0
            incr = 2+3+1;
845
0
        else if (ch < 10000)
846
0
            incr = 2+4+1;
847
0
        else if (ch < 100000)
848
0
            incr = 2+5+1;
849
0
        else if (ch < 1000000)
850
0
            incr = 2+6+1;
851
0
        else {
852
0
            assert(ch <= MAX_UNICODE);
853
0
            incr = 2+7+1;
854
0
        }
855
0
        if (size > PY_SSIZE_T_MAX - incr) {
856
0
            PyErr_SetString(PyExc_OverflowError,
857
0
                            "encoded result is too long for a Python string");
858
0
            return NULL;
859
0
        }
860
0
        size += incr;
861
0
    }
862
863
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
864
0
    if (str == NULL) {
865
0
        return NULL;
866
0
    }
867
868
    /* generate replacement */
869
0
    for (i = collstart; i < collend; ++i) {
870
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
871
0
        if (size < 0) {
872
0
            return NULL;
873
0
        }
874
0
        str += size;
875
0
    }
876
0
    return str;
877
0
}
878
879
/* --- Bloom Filters ----------------------------------------------------- */
880
881
/* stuff to implement simple "bloom filters" for Unicode characters.
882
   to keep things simple, we use a single bitmask, using the low
883
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
884
885
/* the linebreak mask is set up by _PyUnicode_Init() below */
886
887
#if LONG_BIT >= 128
888
#define BLOOM_WIDTH 128
889
#elif LONG_BIT >= 64
890
24.1M
#define BLOOM_WIDTH 64
891
#elif LONG_BIT >= 32
892
#define BLOOM_WIDTH 32
893
#else
894
#error "LONG_BIT is smaller than 32"
895
#endif
896
897
9.58M
#define BLOOM_MASK unsigned long
898
899
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
900
901
30.4M
/* Test whether the bit selected by the low-order bits of `ch` is set in
   `mask`.  Both arguments are parenthesized so that expression arguments
   (e.g. `a | b`) are combined as a unit. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
902
903
#define BLOOM_LINEBREAK(ch)                                             \
904
131M
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
905
131M
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
906
907
static inline BLOOM_MASK
908
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
909
4.79M
{
910
4.79M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
911
4.79M
    do {                                               \
912
4.79M
        TYPE *data = (TYPE *)PTR;                      \
913
4.79M
        TYPE *end = data + LEN;                        \
914
4.79M
        Py_UCS4 ch;                                    \
915
11.4M
        for (; data != end; data++) {                  \
916
6.64M
            ch = *data;                                \
917
6.64M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
918
6.64M
        }                                              \
919
4.79M
        break;                                         \
920
4.79M
    } while (0)
921
922
    /* calculate simple bloom-style bitmask for a given unicode string */
923
924
4.79M
    BLOOM_MASK mask;
925
926
4.79M
    mask = 0;
927
4.79M
    switch (kind) {
928
4.79M
    case PyUnicode_1BYTE_KIND:
929
4.79M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
930
4.79M
        break;
931
36
    case PyUnicode_2BYTE_KIND:
932
36
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
933
36
        break;
934
0
    case PyUnicode_4BYTE_KIND:
935
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
936
0
        break;
937
0
    default:
938
0
        Py_UNREACHABLE();
939
4.79M
    }
940
4.79M
    return mask;
941
942
4.79M
#undef BLOOM_UPDATE
943
4.79M
}
944
945
/* Compilation of templated routines */
946
947
1.02M
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
948
949
#include "stringlib/asciilib.h"
950
#include "stringlib/fastsearch.h"
951
#include "stringlib/partition.h"
952
#include "stringlib/split.h"
953
#include "stringlib/count.h"
954
#include "stringlib/find.h"
955
#include "stringlib/find_max_char.h"
956
#include "stringlib/undef.h"
957
958
#include "stringlib/ucs1lib.h"
959
#include "stringlib/fastsearch.h"
960
#include "stringlib/partition.h"
961
#include "stringlib/split.h"
962
#include "stringlib/count.h"
963
#include "stringlib/find.h"
964
#include "stringlib/replace.h"
965
#include "stringlib/repr.h"
966
#include "stringlib/find_max_char.h"
967
#include "stringlib/undef.h"
968
969
#include "stringlib/ucs2lib.h"
970
#include "stringlib/fastsearch.h"
971
#include "stringlib/partition.h"
972
#include "stringlib/split.h"
973
#include "stringlib/count.h"
974
#include "stringlib/find.h"
975
#include "stringlib/replace.h"
976
#include "stringlib/repr.h"
977
#include "stringlib/find_max_char.h"
978
#include "stringlib/undef.h"
979
980
#include "stringlib/ucs4lib.h"
981
#include "stringlib/fastsearch.h"
982
#include "stringlib/partition.h"
983
#include "stringlib/split.h"
984
#include "stringlib/count.h"
985
#include "stringlib/find.h"
986
#include "stringlib/replace.h"
987
#include "stringlib/repr.h"
988
#include "stringlib/find_max_char.h"
989
#include "stringlib/undef.h"
990
991
#undef STRINGLIB_GET_EMPTY
992
993
/* --- Unicode Object ----------------------------------------------------- */
994
995
static inline Py_ssize_t
996
findchar(const void *s, int kind,
997
         Py_ssize_t size, Py_UCS4 ch,
998
         int direction)
999
214M
{
1000
214M
    switch (kind) {
1001
206M
    case PyUnicode_1BYTE_KIND:
1002
206M
        if ((Py_UCS1) ch != ch)
1003
3.42k
            return -1;
1004
206M
        if (direction > 0)
1005
206M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1006
67.2k
        else
1007
67.2k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1008
4.98M
    case PyUnicode_2BYTE_KIND:
1009
4.98M
        if ((Py_UCS2) ch != ch)
1010
0
            return -1;
1011
4.98M
        if (direction > 0)
1012
4.70M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1013
277k
        else
1014
277k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1015
3.02M
    case PyUnicode_4BYTE_KIND:
1016
3.02M
        if (direction > 0)
1017
2.92M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1018
99.1k
        else
1019
99.1k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1020
0
    default:
1021
0
        Py_UNREACHABLE();
1022
214M
    }
1023
214M
}
1024
1025
#ifdef Py_DEBUG
1026
/* Fill the data of a Unicode string with invalid characters to detect bugs
1027
   earlier.
1028
1029
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1030
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1031
   invalid character in Unicode 6.0. */
1032
static void
1033
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1034
{
1035
    int kind = PyUnicode_KIND(unicode);
1036
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1037
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1038
    if (length <= old_length)
1039
        return;
1040
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1041
}
1042
#endif
1043
1044
static PyObject*
1045
resize_copy(PyObject *unicode, Py_ssize_t length)
1046
0
{
1047
0
    Py_ssize_t copy_length;
1048
0
    PyObject *copy;
1049
1050
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1051
0
    if (copy == NULL)
1052
0
        return NULL;
1053
1054
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1055
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1056
0
    return copy;
1057
0
}
1058
1059
PyObject*
1060
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1061
61.7M
{
1062
61.7M
    Py_ssize_t char_size;
1063
61.7M
    Py_ssize_t struct_size;
1064
61.7M
    Py_ssize_t new_size;
1065
61.7M
    PyObject *new_unicode;
1066
#ifdef Py_DEBUG
1067
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1068
#endif
1069
1070
61.7M
    if (!_PyUnicode_IsModifiable(unicode)) {
1071
0
        PyObject *copy = resize_copy(unicode, length);
1072
0
        if (copy == NULL) {
1073
0
            return NULL;
1074
0
        }
1075
0
        Py_DECREF(unicode);
1076
0
        return copy;
1077
0
    }
1078
61.7M
    assert(PyUnicode_IS_COMPACT(unicode));
1079
1080
61.7M
    char_size = PyUnicode_KIND(unicode);
1081
61.7M
    if (PyUnicode_IS_ASCII(unicode))
1082
38.0M
        struct_size = sizeof(PyASCIIObject);
1083
23.7M
    else
1084
23.7M
        struct_size = sizeof(PyCompactUnicodeObject);
1085
1086
61.7M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1087
0
        PyErr_NoMemory();
1088
0
        return NULL;
1089
0
    }
1090
61.7M
    new_size = (struct_size + (length + 1) * char_size);
1091
1092
61.7M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1093
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1094
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1095
0
        PyUnicode_SET_UTF8(unicode, NULL);
1096
0
    }
1097
#ifdef Py_TRACE_REFS
1098
    _Py_ForgetReference(unicode);
1099
#endif
1100
61.7M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1101
1102
61.7M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1103
61.7M
    if (new_unicode == NULL) {
1104
0
        _Py_NewReferenceNoTotal(unicode);
1105
0
        PyErr_NoMemory();
1106
0
        return NULL;
1107
0
    }
1108
61.7M
    unicode = new_unicode;
1109
61.7M
    _Py_NewReferenceNoTotal(unicode);
1110
1111
61.7M
    _PyUnicode_LENGTH(unicode) = length;
1112
#ifdef Py_DEBUG
1113
    unicode_fill_invalid(unicode, old_length);
1114
#endif
1115
61.7M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1116
61.7M
                    length, 0);
1117
61.7M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1118
61.7M
    return unicode;
1119
61.7M
}
1120
1121
static int
1122
resize_inplace(PyObject *unicode, Py_ssize_t length)
1123
0
{
1124
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1125
0
    assert(Py_REFCNT(unicode) == 1);
1126
1127
0
    Py_ssize_t new_size;
1128
0
    Py_ssize_t char_size;
1129
0
    int share_utf8;
1130
0
    void *data;
1131
#ifdef Py_DEBUG
1132
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1133
#endif
1134
1135
0
    data = _PyUnicode_DATA_ANY(unicode);
1136
0
    char_size = PyUnicode_KIND(unicode);
1137
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1138
1139
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1140
0
        PyErr_NoMemory();
1141
0
        return -1;
1142
0
    }
1143
0
    new_size = (length + 1) * char_size;
1144
1145
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1146
0
    {
1147
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1148
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1149
0
        PyUnicode_SET_UTF8(unicode, NULL);
1150
0
    }
1151
1152
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1153
0
    if (data == NULL) {
1154
0
        PyErr_NoMemory();
1155
0
        return -1;
1156
0
    }
1157
0
    _PyUnicode_DATA_ANY(unicode) = data;
1158
0
    if (share_utf8) {
1159
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1160
0
        PyUnicode_SET_UTF8(unicode, data);
1161
0
    }
1162
0
    _PyUnicode_LENGTH(unicode) = length;
1163
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1164
#ifdef Py_DEBUG
1165
    unicode_fill_invalid(unicode, old_length);
1166
#endif
1167
1168
    /* check for integer overflow */
1169
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1170
0
        PyErr_NoMemory();
1171
0
        return -1;
1172
0
    }
1173
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1174
0
    return 0;
1175
0
}
1176
1177
static const char*
1178
unicode_kind_name(PyObject *unicode)
1179
0
{
1180
    /* don't check consistency: unicode_kind_name() is called from
1181
       _PyUnicode_Dump() */
1182
0
    if (!PyUnicode_IS_COMPACT(unicode))
1183
0
    {
1184
0
        switch (PyUnicode_KIND(unicode))
1185
0
        {
1186
0
        case PyUnicode_1BYTE_KIND:
1187
0
            if (PyUnicode_IS_ASCII(unicode))
1188
0
                return "legacy ascii";
1189
0
            else
1190
0
                return "legacy latin1";
1191
0
        case PyUnicode_2BYTE_KIND:
1192
0
            return "legacy UCS2";
1193
0
        case PyUnicode_4BYTE_KIND:
1194
0
            return "legacy UCS4";
1195
0
        default:
1196
0
            return "<legacy invalid kind>";
1197
0
        }
1198
0
    }
1199
0
    switch (PyUnicode_KIND(unicode)) {
1200
0
    case PyUnicode_1BYTE_KIND:
1201
0
        if (PyUnicode_IS_ASCII(unicode))
1202
0
            return "ascii";
1203
0
        else
1204
0
            return "latin1";
1205
0
    case PyUnicode_2BYTE_KIND:
1206
0
        return "UCS2";
1207
0
    case PyUnicode_4BYTE_KIND:
1208
0
        return "UCS4";
1209
0
    default:
1210
0
        return "<invalid compact kind>";
1211
0
    }
1212
0
}
1213
1214
#ifdef Py_DEBUG
1215
/* Functions wrapping macros for use in debugger */
1216
const char *_PyUnicode_utf8(void *unicode_raw){
1217
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1218
    return PyUnicode_UTF8(unicode);
1219
}
1220
1221
const void *_PyUnicode_compact_data(void *unicode_raw) {
1222
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1223
    return _PyUnicode_COMPACT_DATA(unicode);
1224
}
1225
const void *_PyUnicode_data(void *unicode_raw) {
1226
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1227
    printf("obj %p\n", (void*)unicode);
1228
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1229
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1230
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1231
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1232
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1233
    return PyUnicode_DATA(unicode);
1234
}
1235
1236
void
1237
_PyUnicode_Dump(PyObject *op)
1238
{
1239
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1240
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1241
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1242
    const void *data;
1243
1244
    if (ascii->state.compact)
1245
    {
1246
        if (ascii->state.ascii)
1247
            data = (ascii + 1);
1248
        else
1249
            data = (compact + 1);
1250
    }
1251
    else
1252
        data = unicode->data.any;
1253
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1254
1255
    if (!ascii->state.ascii) {
1256
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1257
    }
1258
    printf(", data=%p\n", data);
1259
}
1260
#endif
1261
1262
1263
PyObject *
1264
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1265
568M
{
1266
    /* Optimization for empty strings */
1267
568M
    if (size == 0) {
1268
22.2M
        return _PyUnicode_GetEmpty();
1269
22.2M
    }
1270
1271
546M
    PyObject *obj;
1272
546M
    PyCompactUnicodeObject *unicode;
1273
546M
    void *data;
1274
546M
    int kind;
1275
546M
    int is_ascii;
1276
546M
    Py_ssize_t char_size;
1277
546M
    Py_ssize_t struct_size;
1278
1279
546M
    is_ascii = 0;
1280
546M
    struct_size = sizeof(PyCompactUnicodeObject);
1281
546M
    if (maxchar < 128) {
1282
360M
        kind = PyUnicode_1BYTE_KIND;
1283
360M
        char_size = 1;
1284
360M
        is_ascii = 1;
1285
360M
        struct_size = sizeof(PyASCIIObject);
1286
360M
    }
1287
185M
    else if (maxchar < 256) {
1288
14.7M
        kind = PyUnicode_1BYTE_KIND;
1289
14.7M
        char_size = 1;
1290
14.7M
    }
1291
170M
    else if (maxchar < 65536) {
1292
159M
        kind = PyUnicode_2BYTE_KIND;
1293
159M
        char_size = 2;
1294
159M
    }
1295
11.6M
    else {
1296
11.6M
        if (maxchar > MAX_UNICODE) {
1297
0
            PyErr_SetString(PyExc_SystemError,
1298
0
                            "invalid maximum character passed to PyUnicode_New");
1299
0
            return NULL;
1300
0
        }
1301
11.6M
        kind = PyUnicode_4BYTE_KIND;
1302
11.6M
        char_size = 4;
1303
11.6M
    }
1304
1305
    /* Ensure we won't overflow the size. */
1306
546M
    if (size < 0) {
1307
0
        PyErr_SetString(PyExc_SystemError,
1308
0
                        "Negative size passed to PyUnicode_New");
1309
0
        return NULL;
1310
0
    }
1311
546M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1312
0
        return PyErr_NoMemory();
1313
1314
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1315
     * PyObject_New() so we are able to allocate space for the object and
1316
     * it's data buffer.
1317
     */
1318
546M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1319
546M
    if (obj == NULL) {
1320
0
        return PyErr_NoMemory();
1321
0
    }
1322
546M
    _PyObject_Init(obj, &PyUnicode_Type);
1323
1324
546M
    unicode = (PyCompactUnicodeObject *)obj;
1325
546M
    if (is_ascii)
1326
360M
        data = ((PyASCIIObject*)obj) + 1;
1327
185M
    else
1328
185M
        data = unicode + 1;
1329
546M
    _PyUnicode_LENGTH(unicode) = size;
1330
546M
    _PyUnicode_HASH(unicode) = -1;
1331
546M
    _PyUnicode_STATE(unicode).interned = 0;
1332
546M
    _PyUnicode_STATE(unicode).kind = kind;
1333
546M
    _PyUnicode_STATE(unicode).compact = 1;
1334
546M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1335
546M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1336
546M
    if (is_ascii) {
1337
360M
        ((char*)data)[size] = 0;
1338
360M
    }
1339
185M
    else if (kind == PyUnicode_1BYTE_KIND) {
1340
14.7M
        ((char*)data)[size] = 0;
1341
14.7M
        unicode->utf8 = NULL;
1342
14.7M
        unicode->utf8_length = 0;
1343
14.7M
    }
1344
170M
    else {
1345
170M
        unicode->utf8 = NULL;
1346
170M
        unicode->utf8_length = 0;
1347
170M
        if (kind == PyUnicode_2BYTE_KIND)
1348
159M
            ((Py_UCS2*)data)[size] = 0;
1349
11.6M
        else /* kind == PyUnicode_4BYTE_KIND */
1350
11.6M
            ((Py_UCS4*)data)[size] = 0;
1351
170M
    }
1352
#ifdef Py_DEBUG
1353
    unicode_fill_invalid((PyObject*)unicode, 0);
1354
#endif
1355
546M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1356
546M
    return obj;
1357
546M
}
1358
1359
static int
1360
unicode_check_modifiable(PyObject *unicode)
1361
650
{
1362
650
    if (!_PyUnicode_IsModifiable(unicode)) {
1363
0
        PyErr_SetString(PyExc_SystemError,
1364
0
                        "Cannot modify a string currently used");
1365
0
        return -1;
1366
0
    }
1367
650
    return 0;
1368
650
}
1369
1370
static int
1371
_copy_characters(PyObject *to, Py_ssize_t to_start,
1372
                 PyObject *from, Py_ssize_t from_start,
1373
                 Py_ssize_t how_many, int check_maxchar)
1374
267M
{
1375
267M
    int from_kind, to_kind;
1376
267M
    const void *from_data;
1377
267M
    void *to_data;
1378
1379
267M
    assert(0 <= how_many);
1380
267M
    assert(0 <= from_start);
1381
267M
    assert(0 <= to_start);
1382
267M
    assert(PyUnicode_Check(from));
1383
267M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1384
1385
267M
    assert(to == NULL || PyUnicode_Check(to));
1386
1387
267M
    if (how_many == 0) {
1388
5.46M
        return 0;
1389
5.46M
    }
1390
1391
267M
    assert(to != NULL);
1392
262M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1393
1394
262M
    from_kind = PyUnicode_KIND(from);
1395
262M
    from_data = PyUnicode_DATA(from);
1396
262M
    to_kind = PyUnicode_KIND(to);
1397
262M
    to_data = PyUnicode_DATA(to);
1398
1399
#ifdef Py_DEBUG
1400
    if (!check_maxchar
1401
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1402
    {
1403
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1404
        Py_UCS4 ch;
1405
        Py_ssize_t i;
1406
        for (i=0; i < how_many; i++) {
1407
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1408
            assert(ch <= to_maxchar);
1409
        }
1410
    }
1411
#endif
1412
1413
262M
    if (from_kind == to_kind) {
1414
167M
        if (check_maxchar
1415
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1416
0
        {
1417
            /* Writing Latin-1 characters into an ASCII string requires to
1418
               check that all written characters are pure ASCII */
1419
0
            Py_UCS4 max_char;
1420
0
            max_char = ucs1lib_find_max_char(from_data,
1421
0
                                             (const Py_UCS1*)from_data + how_many);
1422
0
            if (max_char >= 128)
1423
0
                return -1;
1424
0
        }
1425
167M
        memcpy((char*)to_data + to_kind * to_start,
1426
167M
                  (const char*)from_data + from_kind * from_start,
1427
167M
                  to_kind * how_many);
1428
167M
    }
1429
94.2M
    else if (from_kind == PyUnicode_1BYTE_KIND
1430
92.5M
             && to_kind == PyUnicode_2BYTE_KIND)
1431
80.6M
    {
1432
80.6M
        _PyUnicode_CONVERT_BYTES(
1433
80.6M
            Py_UCS1, Py_UCS2,
1434
80.6M
            PyUnicode_1BYTE_DATA(from) + from_start,
1435
80.6M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1436
80.6M
            PyUnicode_2BYTE_DATA(to) + to_start
1437
80.6M
            );
1438
80.6M
    }
1439
13.6M
    else if (from_kind == PyUnicode_1BYTE_KIND
1440
11.8M
             && to_kind == PyUnicode_4BYTE_KIND)
1441
11.8M
    {
1442
11.8M
        _PyUnicode_CONVERT_BYTES(
1443
11.8M
            Py_UCS1, Py_UCS4,
1444
11.8M
            PyUnicode_1BYTE_DATA(from) + from_start,
1445
11.8M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1446
11.8M
            PyUnicode_4BYTE_DATA(to) + to_start
1447
11.8M
            );
1448
11.8M
    }
1449
1.77M
    else if (from_kind == PyUnicode_2BYTE_KIND
1450
1.75M
             && to_kind == PyUnicode_4BYTE_KIND)
1451
1.75M
    {
1452
1.75M
        _PyUnicode_CONVERT_BYTES(
1453
1.75M
            Py_UCS2, Py_UCS4,
1454
1.75M
            PyUnicode_2BYTE_DATA(from) + from_start,
1455
1.75M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1456
1.75M
            PyUnicode_4BYTE_DATA(to) + to_start
1457
1.75M
            );
1458
1.75M
    }
1459
15.1k
    else {
1460
15.1k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1461
1462
15.1k
        if (!check_maxchar) {
1463
15.1k
            if (from_kind == PyUnicode_2BYTE_KIND
1464
3.04k
                && to_kind == PyUnicode_1BYTE_KIND)
1465
3.04k
            {
1466
3.04k
                _PyUnicode_CONVERT_BYTES(
1467
3.04k
                    Py_UCS2, Py_UCS1,
1468
3.04k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1469
3.04k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1470
3.04k
                    PyUnicode_1BYTE_DATA(to) + to_start
1471
3.04k
                    );
1472
3.04k
            }
1473
12.1k
            else if (from_kind == PyUnicode_4BYTE_KIND
1474
12.1k
                     && to_kind == PyUnicode_1BYTE_KIND)
1475
8.23k
            {
1476
8.23k
                _PyUnicode_CONVERT_BYTES(
1477
8.23k
                    Py_UCS4, Py_UCS1,
1478
8.23k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1479
8.23k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1480
8.23k
                    PyUnicode_1BYTE_DATA(to) + to_start
1481
8.23k
                    );
1482
8.23k
            }
1483
3.91k
            else if (from_kind == PyUnicode_4BYTE_KIND
1484
3.91k
                     && to_kind == PyUnicode_2BYTE_KIND)
1485
3.91k
            {
1486
3.91k
                _PyUnicode_CONVERT_BYTES(
1487
3.91k
                    Py_UCS4, Py_UCS2,
1488
3.91k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1489
3.91k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1490
3.91k
                    PyUnicode_2BYTE_DATA(to) + to_start
1491
3.91k
                    );
1492
3.91k
            }
1493
0
            else {
1494
0
                Py_UNREACHABLE();
1495
0
            }
1496
15.1k
        }
1497
0
        else {
1498
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1499
0
            Py_UCS4 ch;
1500
0
            Py_ssize_t i;
1501
1502
0
            for (i=0; i < how_many; i++) {
1503
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1504
0
                if (ch > to_maxchar)
1505
0
                    return -1;
1506
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1507
0
            }
1508
0
        }
1509
15.1k
    }
1510
262M
    return 0;
1511
262M
}
1512
1513
void
1514
_PyUnicode_FastCopyCharacters(
1515
    PyObject *to, Py_ssize_t to_start,
1516
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1517
267M
{
1518
267M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1519
267M
}
1520
1521
Py_ssize_t
1522
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1523
                         PyObject *from, Py_ssize_t from_start,
1524
                         Py_ssize_t how_many)
1525
0
{
1526
0
    int err;
1527
1528
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1529
0
        PyErr_BadInternalCall();
1530
0
        return -1;
1531
0
    }
1532
1533
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1534
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1535
0
        return -1;
1536
0
    }
1537
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1538
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1539
0
        return -1;
1540
0
    }
1541
0
    if (how_many < 0) {
1542
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1543
0
        return -1;
1544
0
    }
1545
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1546
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1547
0
        PyErr_Format(PyExc_SystemError,
1548
0
                     "Cannot write %zi characters at %zi "
1549
0
                     "in a string of %zi characters",
1550
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1551
0
        return -1;
1552
0
    }
1553
1554
0
    if (how_many == 0)
1555
0
        return 0;
1556
1557
0
    if (unicode_check_modifiable(to))
1558
0
        return -1;
1559
1560
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1561
0
    if (err) {
1562
0
        PyErr_Format(PyExc_SystemError,
1563
0
                     "Cannot copy %s characters "
1564
0
                     "into a string of %s characters",
1565
0
                     unicode_kind_name(from),
1566
0
                     unicode_kind_name(to));
1567
0
        return -1;
1568
0
    }
1569
0
    return how_many;
1570
0
}
1571
1572
/* Find the maximum code point and count the number of surrogate pairs so a
1573
   correct string length can be computed before converting a string to UCS4.
1574
   This function counts single surrogates as a character and not as a pair.
1575
1576
   Return 0 on success, or -1 on error. */
1577
static int
1578
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1579
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1580
583k
{
1581
583k
    const wchar_t *iter;
1582
583k
    Py_UCS4 ch;
1583
1584
583k
    assert(num_surrogates != NULL && maxchar != NULL);
1585
583k
    *num_surrogates = 0;
1586
583k
    *maxchar = 0;
1587
1588
14.6M
    for (iter = begin; iter < end; ) {
1589
#if SIZEOF_WCHAR_T == 2
1590
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1591
            && (iter+1) < end
1592
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1593
        {
1594
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1595
            ++(*num_surrogates);
1596
            iter += 2;
1597
        }
1598
        else
1599
#endif
1600
14.0M
        {
1601
14.0M
            ch = *iter;
1602
14.0M
            iter++;
1603
14.0M
        }
1604
14.0M
        if (ch > *maxchar) {
1605
2.33M
            *maxchar = ch;
1606
2.33M
            if (*maxchar > MAX_UNICODE) {
1607
0
                PyErr_Format(PyExc_ValueError,
1608
0
                             "character U+%x is not in range [U+0000; U+%x]",
1609
0
                             ch, MAX_UNICODE);
1610
0
                return -1;
1611
0
            }
1612
2.33M
        }
1613
14.0M
    }
1614
583k
    return 0;
1615
583k
}
1616
1617
static void
1618
unicode_dealloc(PyObject *unicode)
1619
560M
{
1620
#ifdef Py_DEBUG
1621
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1622
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1623
    }
1624
#endif
1625
560M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1626
        /* This should never get called, but we also don't want to SEGV if
1627
        * we accidentally decref an immortal string out of existence. Since
1628
        * the string is an immortal object, just re-set the reference count.
1629
        */
1630
#ifdef Py_DEBUG
1631
        Py_UNREACHABLE();
1632
#endif
1633
0
        _Py_SetImmortal(unicode);
1634
0
        return;
1635
0
    }
1636
560M
    switch (_PyUnicode_STATE(unicode).interned) {
1637
559M
        case SSTATE_NOT_INTERNED:
1638
559M
            break;
1639
456k
        case SSTATE_INTERNED_MORTAL:
1640
            /* Remove the object from the intern dict.
1641
             * Before doing so, we set the refcount to 2: the key and value
1642
             * in the interned_dict.
1643
             */
1644
456k
            assert(Py_REFCNT(unicode) == 0);
1645
456k
            Py_SET_REFCNT(unicode, 2);
1646
#ifdef Py_REF_DEBUG
1647
            /* let's be pedantic with the ref total */
1648
            _Py_IncRefTotal(_PyThreadState_GET());
1649
            _Py_IncRefTotal(_PyThreadState_GET());
1650
#endif
1651
456k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1652
456k
            PyObject *interned = get_interned_dict(interp);
1653
456k
            assert(interned != NULL);
1654
456k
            PyObject *popped;
1655
456k
            int r = PyDict_Pop(interned, unicode, &popped);
1656
456k
            if (r == -1) {
1657
0
                PyErr_FormatUnraisable("Exception ignored while "
1658
0
                                       "removing an interned string %R",
1659
0
                                       unicode);
1660
                // We don't know what happened to the string. It's probably
1661
                // best to leak it:
1662
                // - if it was popped, there are no more references to it
1663
                //   so it can't cause trouble (except wasted memory)
1664
                // - if it wasn't popped, it'll remain interned
1665
0
                _Py_SetImmortal(unicode);
1666
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1667
0
                return;
1668
0
            }
1669
456k
            if (r == 0) {
1670
                // The interned string was not found in the interned_dict.
1671
#ifdef Py_DEBUG
1672
                Py_UNREACHABLE();
1673
#endif
1674
0
                _Py_SetImmortal(unicode);
1675
0
                return;
1676
0
            }
1677
            // Successfully popped.
1678
456k
            assert(popped == unicode);
1679
            // Only our `popped` reference should be left; remove it too.
1680
456k
            assert(Py_REFCNT(unicode) == 1);
1681
456k
            Py_SET_REFCNT(unicode, 0);
1682
#ifdef Py_REF_DEBUG
1683
            /* let's be pedantic with the ref total */
1684
            _Py_DecRefTotal(_PyThreadState_GET());
1685
#endif
1686
456k
            break;
1687
0
        default:
1688
            // As with `statically_allocated` above.
1689
#ifdef Py_REF_DEBUG
1690
            Py_UNREACHABLE();
1691
#endif
1692
0
            _Py_SetImmortal(unicode);
1693
0
            return;
1694
560M
    }
1695
560M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1696
159k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1697
159k
    }
1698
560M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1699
16.0M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1700
16.0M
    }
1701
1702
560M
    Py_TYPE(unicode)->tp_free(unicode);
1703
560M
}
1704
1705
#ifdef Py_DEBUG
1706
static int
1707
unicode_is_singleton(PyObject *unicode)
1708
{
1709
    if (unicode == &_Py_STR(empty)) {
1710
        return 1;
1711
    }
1712
1713
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1714
    if (ascii->length == 1) {
1715
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1716
        if (ch < 256 && LATIN1(ch) == unicode) {
1717
            return 1;
1718
        }
1719
    }
1720
    return 0;
1721
}
1722
#endif
1723
1724
int
1725
_PyUnicode_IsModifiable(PyObject *unicode)
1726
69.2M
{
1727
69.2M
    assert(_PyUnicode_CHECK(unicode));
1728
69.2M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1729
2.74M
        return 0;
1730
66.4M
    if (PyUnicode_HASH(unicode) != -1)
1731
0
        return 0;
1732
66.4M
    if (PyUnicode_CHECK_INTERNED(unicode))
1733
0
        return 0;
1734
66.4M
    if (!PyUnicode_CheckExact(unicode))
1735
0
        return 0;
1736
#ifdef Py_DEBUG
1737
    /* singleton refcount is greater than 1 */
1738
    assert(!unicode_is_singleton(unicode));
1739
#endif
1740
66.4M
    return 1;
1741
66.4M
}
1742
1743
static int
1744
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1745
2.35M
{
1746
2.35M
    PyObject *unicode;
1747
2.35M
    Py_ssize_t old_length;
1748
1749
2.35M
    assert(p_unicode != NULL);
1750
2.35M
    unicode = *p_unicode;
1751
1752
2.35M
    assert(unicode != NULL);
1753
2.35M
    assert(PyUnicode_Check(unicode));
1754
2.35M
    assert(0 <= length);
1755
1756
2.35M
    old_length = PyUnicode_GET_LENGTH(unicode);
1757
2.35M
    if (old_length == length)
1758
0
        return 0;
1759
1760
2.35M
    if (length == 0) {
1761
0
        PyObject *empty = _PyUnicode_GetEmpty();
1762
0
        Py_SETREF(*p_unicode, empty);
1763
0
        return 0;
1764
0
    }
1765
1766
2.35M
    if (!_PyUnicode_IsModifiable(unicode)) {
1767
0
        PyObject *copy = resize_copy(unicode, length);
1768
0
        if (copy == NULL)
1769
0
            return -1;
1770
0
        Py_SETREF(*p_unicode, copy);
1771
0
        return 0;
1772
0
    }
1773
1774
2.35M
    if (PyUnicode_IS_COMPACT(unicode)) {
1775
2.35M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1776
2.35M
        if (new_unicode == NULL)
1777
0
            return -1;
1778
2.35M
        *p_unicode = new_unicode;
1779
2.35M
        return 0;
1780
2.35M
    }
1781
0
    return resize_inplace(unicode, length);
1782
2.35M
}
1783
1784
int
1785
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1786
0
{
1787
0
    PyObject *unicode;
1788
0
    if (p_unicode == NULL) {
1789
0
        PyErr_BadInternalCall();
1790
0
        return -1;
1791
0
    }
1792
0
    unicode = *p_unicode;
1793
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1794
0
    {
1795
0
        PyErr_BadInternalCall();
1796
0
        return -1;
1797
0
    }
1798
0
    return unicode_resize(p_unicode, length);
1799
0
}
1800
1801
static PyObject*
1802
get_latin1_char(Py_UCS1 ch)
1803
213M
{
1804
213M
    PyObject *o = LATIN1(ch);
1805
213M
    return o;
1806
213M
}
1807
1808
static PyObject*
1809
unicode_char(Py_UCS4 ch)
1810
237M
{
1811
237M
    PyObject *unicode;
1812
1813
237M
    assert(ch <= MAX_UNICODE);
1814
1815
237M
    if (ch < 256) {
1816
125M
        return get_latin1_char(ch);
1817
125M
    }
1818
1819
112M
    unicode = PyUnicode_New(1, ch);
1820
112M
    if (unicode == NULL)
1821
0
        return NULL;
1822
1823
112M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1824
112M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1825
103M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1826
103M
    } else {
1827
9.23M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1828
9.23M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1829
9.23M
    }
1830
112M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1831
112M
    return unicode;
1832
112M
}
1833
1834
1835
static inline void
1836
unicode_write_widechar(int kind, void *data,
1837
                       const wchar_t *u, Py_ssize_t size,
1838
                       Py_ssize_t num_surrogates)
1839
583k
{
1840
583k
    switch (kind) {
1841
514k
    case PyUnicode_1BYTE_KIND:
1842
514k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1843
514k
        break;
1844
1845
68.3k
    case PyUnicode_2BYTE_KIND:
1846
#if SIZEOF_WCHAR_T == 2
1847
        memcpy(data, u, size * 2);
1848
#else
1849
68.3k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1850
68.3k
#endif
1851
68.3k
        break;
1852
1853
804
    case PyUnicode_4BYTE_KIND:
1854
804
    {
1855
#if SIZEOF_WCHAR_T == 2
1856
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1857
        // surrogate pairs.
1858
        const wchar_t *end = u + size;
1859
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1860
#  ifndef NDEBUG
1861
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1862
#  endif
1863
        for (const wchar_t *iter = u; iter < end; ) {
1864
            assert(ucs4_out < ucs4_end);
1865
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1866
                && (iter+1) < end
1867
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1868
            {
1869
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1870
                iter += 2;
1871
            }
1872
            else {
1873
                *ucs4_out++ = *iter;
1874
                iter++;
1875
            }
1876
        }
1877
        assert(ucs4_out == ucs4_end);
1878
#else
1879
804
        assert(num_surrogates == 0);
1880
804
        memcpy(data, u, size * 4);
1881
804
#endif
1882
804
        break;
1883
0
    }
1884
0
    default:
1885
0
        Py_UNREACHABLE();
1886
583k
    }
1887
583k
}
1888
1889
1890
PyObject *
1891
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1892
1.10M
{
1893
1.10M
    PyObject *unicode;
1894
1.10M
    Py_UCS4 maxchar = 0;
1895
1.10M
    Py_ssize_t num_surrogates;
1896
1897
1.10M
    if (u == NULL && size != 0) {
1898
0
        PyErr_BadInternalCall();
1899
0
        return NULL;
1900
0
    }
1901
1902
1.10M
    if (size == -1) {
1903
1.29k
        size = wcslen(u);
1904
1.29k
    }
1905
1906
    /* If the Unicode data is known at construction time, we can apply
1907
       some optimizations which share commonly used objects. */
1908
1909
    /* Optimization for empty strings */
1910
1.10M
    if (size == 0)
1911
424k
        _Py_RETURN_UNICODE_EMPTY();
1912
1913
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1914
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1915
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1916
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1917
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1918
        if (!converted) {
1919
            return NULL;
1920
        }
1921
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1922
        PyMem_Free(converted);
1923
        return unicode;
1924
    }
1925
#endif
1926
1927
    /* Single character Unicode objects in the Latin-1 range are
1928
       shared when using this constructor */
1929
680k
    if (size == 1 && (Py_UCS4)*u < 256)
1930
97.1k
        return get_latin1_char((unsigned char)*u);
1931
1932
    /* If not empty and not single character, copy the Unicode data
1933
       into the new object */
1934
583k
    if (find_maxchar_surrogates(u, u + size,
1935
583k
                                &maxchar, &num_surrogates) == -1)
1936
0
        return NULL;
1937
1938
583k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1939
583k
    if (!unicode)
1940
0
        return NULL;
1941
1942
583k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1943
583k
                           u, size, num_surrogates);
1944
1945
583k
    return unicode_result(unicode);
1946
583k
}
1947
1948
1949
int
1950
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1951
                              const wchar_t *str,
1952
                              Py_ssize_t size)
1953
0
{
1954
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1955
1956
0
    if (size < 0) {
1957
0
        size = wcslen(str);
1958
0
    }
1959
1960
0
    if (size == 0) {
1961
0
        return 0;
1962
0
    }
1963
1964
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1965
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1966
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1967
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1968
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1969
        if (!converted) {
1970
            return -1;
1971
        }
1972
1973
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1974
        PyMem_Free(converted);
1975
        return res;
1976
    }
1977
#endif
1978
1979
0
    Py_UCS4 maxchar = 0;
1980
0
    Py_ssize_t num_surrogates;
1981
0
    if (find_maxchar_surrogates(str, str + size,
1982
0
                                &maxchar, &num_surrogates) == -1) {
1983
0
        return -1;
1984
0
    }
1985
1986
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1987
0
        return -1;
1988
0
    }
1989
1990
0
    int kind = writer->kind;
1991
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
1992
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
1993
1994
0
    writer->pos += size - num_surrogates;
1995
0
    return 0;
1996
0
}
1997
1998
1999
PyObject *
2000
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2001
7.85M
{
2002
7.85M
    if (size < 0) {
2003
0
        PyErr_SetString(PyExc_SystemError,
2004
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2005
0
        return NULL;
2006
0
    }
2007
7.85M
    if (u != NULL) {
2008
7.85M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2009
7.85M
    }
2010
0
    if (size > 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
0
    return _PyUnicode_GetEmpty();
2016
0
}
2017
2018
PyObject *
2019
PyUnicode_FromString(const char *u)
2020
21.9M
{
2021
21.9M
    size_t size = strlen(u);
2022
21.9M
    if (size > PY_SSIZE_T_MAX) {
2023
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2024
0
        return NULL;
2025
0
    }
2026
21.9M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2027
21.9M
}
2028
2029
2030
PyObject *
2031
_PyUnicode_FromId(_Py_Identifier *id)
2032
0
{
2033
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2034
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2035
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2036
2037
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2038
0
    if (index < 0) {
2039
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2040
2041
0
        PyMutex_Lock(&rt_ids->mutex);
2042
        // Check again to detect concurrent access. Another thread can have
2043
        // initialized the index while this thread waited for the lock.
2044
0
        index = _Py_atomic_load_ssize(&id->index);
2045
0
        if (index < 0) {
2046
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2047
0
            index = rt_ids->next_index;
2048
0
            rt_ids->next_index++;
2049
0
            _Py_atomic_store_ssize(&id->index, index);
2050
0
        }
2051
0
        PyMutex_Unlock(&rt_ids->mutex);
2052
0
    }
2053
0
    assert(index >= 0);
2054
2055
0
    PyObject *obj;
2056
0
    if (index < ids->size) {
2057
0
        obj = ids->array[index];
2058
0
        if (obj) {
2059
            // Return a borrowed reference
2060
0
            goto end;
2061
0
        }
2062
0
    }
2063
2064
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2065
0
                                       NULL, NULL);
2066
0
    if (!obj) {
2067
0
        goto end;
2068
0
    }
2069
0
    _PyUnicode_InternImmortal(interp, &obj);
2070
2071
0
    if (index >= ids->size) {
2072
        // Overallocate to reduce the number of realloc
2073
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2074
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2075
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2076
0
        if (new_array == NULL) {
2077
0
            PyErr_NoMemory();
2078
0
            obj = NULL;
2079
0
            goto end;
2080
0
        }
2081
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2082
0
        ids->array = new_array;
2083
0
        ids->size = new_size;
2084
0
    }
2085
2086
    // The array stores a strong reference
2087
0
    ids->array[index] = obj;
2088
2089
0
end:
2090
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2091
    // Return a borrowed reference
2092
0
    return obj;
2093
0
}
2094
2095
2096
static void
2097
unicode_clear_identifiers(struct _Py_unicode_state *state)
2098
0
{
2099
0
    struct _Py_unicode_ids *ids = &state->ids;
2100
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2101
0
        Py_XDECREF(ids->array[i]);
2102
0
    }
2103
0
    ids->size = 0;
2104
0
    PyMem_Free(ids->array);
2105
0
    ids->array = NULL;
2106
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2107
    // after Py_Finalize().
2108
0
}
2109
2110
2111
/* Internal function, doesn't check maximum character */
2112
2113
PyObject*
2114
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2115
118M
{
2116
118M
    const unsigned char *s = (const unsigned char *)buffer;
2117
118M
    PyObject *unicode;
2118
118M
    if (size == 1) {
2119
#ifdef Py_DEBUG
2120
        assert((unsigned char)s[0] < 128);
2121
#endif
2122
44.2M
        return get_latin1_char(s[0]);
2123
44.2M
    }
2124
74.5M
    unicode = PyUnicode_New(size, 127);
2125
74.5M
    if (!unicode)
2126
0
        return NULL;
2127
74.5M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2128
74.5M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2129
74.5M
    return unicode;
2130
74.5M
}
2131
2132
static Py_UCS4
2133
kind_maxchar_limit(int kind)
2134
0
{
2135
0
    switch (kind) {
2136
0
    case PyUnicode_1BYTE_KIND:
2137
0
        return 0x80;
2138
0
    case PyUnicode_2BYTE_KIND:
2139
0
        return 0x100;
2140
0
    case PyUnicode_4BYTE_KIND:
2141
0
        return 0x10000;
2142
0
    default:
2143
0
        Py_UNREACHABLE();
2144
0
    }
2145
0
}
2146
2147
static PyObject*
2148
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2149
47.8M
{
2150
47.8M
    PyObject *res;
2151
47.8M
    unsigned char max_char;
2152
2153
47.8M
    if (size == 0) {
2154
7.23M
        _Py_RETURN_UNICODE_EMPTY();
2155
7.23M
    }
2156
47.8M
    assert(size > 0);
2157
40.6M
    if (size == 1) {
2158
10.3M
        return get_latin1_char(u[0]);
2159
10.3M
    }
2160
2161
30.2M
    max_char = ucs1lib_find_max_char(u, u + size);
2162
30.2M
    res = PyUnicode_New(size, max_char);
2163
30.2M
    if (!res)
2164
0
        return NULL;
2165
30.2M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2166
30.2M
    assert(_PyUnicode_CheckConsistency(res, 1));
2167
30.2M
    return res;
2168
30.2M
}
2169
2170
static PyObject*
2171
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2172
95.5M
{
2173
95.5M
    PyObject *res;
2174
95.5M
    Py_UCS2 max_char;
2175
2176
95.5M
    if (size == 0)
2177
15.0M
        _Py_RETURN_UNICODE_EMPTY();
2178
95.5M
    assert(size > 0);
2179
80.5M
    if (size == 1)
2180
50.9M
        return unicode_char(u[0]);
2181
2182
29.5M
    max_char = ucs2lib_find_max_char(u, u + size);
2183
29.5M
    res = PyUnicode_New(size, max_char);
2184
29.5M
    if (!res)
2185
0
        return NULL;
2186
29.5M
    if (max_char >= 256)
2187
17.9M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2188
11.6M
    else {
2189
11.6M
        _PyUnicode_CONVERT_BYTES(
2190
11.6M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2191
11.6M
    }
2192
29.5M
    assert(_PyUnicode_CheckConsistency(res, 1));
2193
29.5M
    return res;
2194
29.5M
}
2195
2196
static PyObject*
2197
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2198
66.8M
{
2199
66.8M
    PyObject *res;
2200
66.8M
    Py_UCS4 max_char;
2201
2202
66.8M
    if (size == 0)
2203
9.07M
        _Py_RETURN_UNICODE_EMPTY();
2204
66.8M
    assert(size > 0);
2205
57.7M
    if (size == 1)
2206
39.5M
        return unicode_char(u[0]);
2207
2208
18.2M
    max_char = ucs4lib_find_max_char(u, u + size);
2209
18.2M
    res = PyUnicode_New(size, max_char);
2210
18.2M
    if (!res)
2211
0
        return NULL;
2212
18.2M
    if (max_char < 256)
2213
12.6M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2214
18.2M
                                 PyUnicode_1BYTE_DATA(res));
2215
5.60M
    else if (max_char < 0x10000)
2216
4.23M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2217
5.60M
                                 PyUnicode_2BYTE_DATA(res));
2218
1.37M
    else
2219
1.37M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2220
18.2M
    assert(_PyUnicode_CheckConsistency(res, 1));
2221
18.2M
    return res;
2222
18.2M
}
2223
2224
2225
int
2226
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2227
                          const Py_UCS4 *str,
2228
                          Py_ssize_t size)
2229
0
{
2230
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2231
2232
0
    if (size < 0) {
2233
0
        PyErr_SetString(PyExc_ValueError,
2234
0
                        "size must be positive");
2235
0
        return -1;
2236
0
    }
2237
2238
0
    if (size == 0) {
2239
0
        return 0;
2240
0
    }
2241
2242
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2243
2244
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2245
0
        return -1;
2246
0
    }
2247
2248
0
    int kind = writer->kind;
2249
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2250
0
    if (kind == PyUnicode_1BYTE_KIND) {
2251
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2252
0
                                 str, str + size,
2253
0
                                 data);
2254
0
    }
2255
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2256
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2257
0
                                 str, str + size,
2258
0
                                 data);
2259
0
    }
2260
0
    else {
2261
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2262
0
    }
2263
0
    writer->pos += size;
2264
2265
0
    return 0;
2266
0
}
2267
2268
2269
PyObject*
2270
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2271
156M
{
2272
156M
    if (size < 0) {
2273
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2274
0
        return NULL;
2275
0
    }
2276
156M
    switch (kind) {
2277
22.3M
    case PyUnicode_1BYTE_KIND:
2278
22.3M
        return _PyUnicode_FromUCS1(buffer, size);
2279
79.8M
    case PyUnicode_2BYTE_KIND:
2280
79.8M
        return _PyUnicode_FromUCS2(buffer, size);
2281
54.7M
    case PyUnicode_4BYTE_KIND:
2282
54.7M
        return _PyUnicode_FromUCS4(buffer, size);
2283
0
    default:
2284
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2285
0
        return NULL;
2286
156M
    }
2287
156M
}
2288
2289
Py_UCS4
2290
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2291
11.5M
{
2292
11.5M
    int kind;
2293
11.5M
    const void *startptr, *endptr;
2294
2295
11.5M
    assert(0 <= start);
2296
11.5M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2297
11.5M
    assert(start <= end);
2298
2299
11.5M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2300
78.7k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2301
2302
11.4M
    if (start == end)
2303
0
        return 127;
2304
2305
11.4M
    if (PyUnicode_IS_ASCII(unicode))
2306
11.4M
        return 127;
2307
2308
26.5k
    kind = PyUnicode_KIND(unicode);
2309
26.5k
    startptr = PyUnicode_DATA(unicode);
2310
26.5k
    endptr = (char *)startptr + end * kind;
2311
26.5k
    startptr = (char *)startptr + start * kind;
2312
26.5k
    switch(kind) {
2313
2.56k
    case PyUnicode_1BYTE_KIND:
2314
2.56k
        return ucs1lib_find_max_char(startptr, endptr);
2315
4.86k
    case PyUnicode_2BYTE_KIND:
2316
4.86k
        return ucs2lib_find_max_char(startptr, endptr);
2317
19.0k
    case PyUnicode_4BYTE_KIND:
2318
19.0k
        return ucs4lib_find_max_char(startptr, endptr);
2319
0
    default:
2320
0
        Py_UNREACHABLE();
2321
26.5k
    }
2322
26.5k
}
2323
2324
/* Ensure that a string uses the most efficient storage, if it is not the
2325
   case: create a new string of the right kind. Write NULL into *p_unicode
2326
   on error. */
2327
static void
2328
unicode_adjust_maxchar(PyObject **p_unicode)
2329
0
{
2330
0
    PyObject *unicode, *copy;
2331
0
    Py_UCS4 max_char;
2332
0
    Py_ssize_t len;
2333
0
    int kind;
2334
2335
0
    assert(p_unicode != NULL);
2336
0
    unicode = *p_unicode;
2337
0
    if (PyUnicode_IS_ASCII(unicode))
2338
0
        return;
2339
2340
0
    len = PyUnicode_GET_LENGTH(unicode);
2341
0
    kind = PyUnicode_KIND(unicode);
2342
0
    if (kind == PyUnicode_1BYTE_KIND) {
2343
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2344
0
        max_char = ucs1lib_find_max_char(u, u + len);
2345
0
        if (max_char >= 128)
2346
0
            return;
2347
0
    }
2348
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2349
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2350
0
        max_char = ucs2lib_find_max_char(u, u + len);
2351
0
        if (max_char >= 256)
2352
0
            return;
2353
0
    }
2354
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2355
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2356
0
        max_char = ucs4lib_find_max_char(u, u + len);
2357
0
        if (max_char >= 0x10000)
2358
0
            return;
2359
0
    }
2360
0
    else
2361
0
        Py_UNREACHABLE();
2362
2363
0
    copy = PyUnicode_New(len, max_char);
2364
0
    if (copy != NULL)
2365
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2366
0
    Py_DECREF(unicode);
2367
0
    *p_unicode = copy;
2368
0
}
2369
2370
/* Return a new string with the same length, maximum character value and
   character data as `unicode`. Returns NULL with an exception set when
   `unicode` is not a str or the allocation fails. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* The kind value is used as the per-character size in bytes. */
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2392
2393
2394
/* Widen Unicode objects to larger buffers. Don't write terminating null
2395
   character. Return NULL on error. */
2396
2397
static void*
2398
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2399
8.31M
{
2400
8.31M
    void *result;
2401
2402
8.31M
    assert(skind < kind);
2403
8.31M
    switch (kind) {
2404
5.34M
    case PyUnicode_2BYTE_KIND:
2405
5.34M
        result = PyMem_New(Py_UCS2, len);
2406
5.34M
        if (!result)
2407
0
            return PyErr_NoMemory();
2408
5.34M
        assert(skind == PyUnicode_1BYTE_KIND);
2409
5.34M
        _PyUnicode_CONVERT_BYTES(
2410
5.34M
            Py_UCS1, Py_UCS2,
2411
5.34M
            (const Py_UCS1 *)data,
2412
5.34M
            ((const Py_UCS1 *)data) + len,
2413
5.34M
            result);
2414
5.34M
        return result;
2415
2.96M
    case PyUnicode_4BYTE_KIND:
2416
2.96M
        result = PyMem_New(Py_UCS4, len);
2417
2.96M
        if (!result)
2418
0
            return PyErr_NoMemory();
2419
2.96M
        if (skind == PyUnicode_2BYTE_KIND) {
2420
0
            _PyUnicode_CONVERT_BYTES(
2421
0
                Py_UCS2, Py_UCS4,
2422
0
                (const Py_UCS2 *)data,
2423
0
                ((const Py_UCS2 *)data) + len,
2424
0
                result);
2425
0
        }
2426
2.96M
        else {
2427
2.96M
            assert(skind == PyUnicode_1BYTE_KIND);
2428
2.96M
            _PyUnicode_CONVERT_BYTES(
2429
2.96M
                Py_UCS1, Py_UCS4,
2430
2.96M
                (const Py_UCS1 *)data,
2431
2.96M
                ((const Py_UCS1 *)data) + len,
2432
2.96M
                result);
2433
2.96M
        }
2434
2.96M
        return result;
2435
0
    default:
2436
0
        Py_UNREACHABLE();
2437
0
        return NULL;
2438
8.31M
    }
2439
8.31M
}
2440
2441
static Py_UCS4*
2442
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2443
        int copy_null)
2444
75.9k
{
2445
75.9k
    int kind;
2446
75.9k
    const void *data;
2447
75.9k
    Py_ssize_t len, targetlen;
2448
75.9k
    kind = PyUnicode_KIND(string);
2449
75.9k
    data = PyUnicode_DATA(string);
2450
75.9k
    len = PyUnicode_GET_LENGTH(string);
2451
75.9k
    targetlen = len;
2452
75.9k
    if (copy_null)
2453
0
        targetlen++;
2454
75.9k
    if (!target) {
2455
0
        target = PyMem_New(Py_UCS4, targetlen);
2456
0
        if (!target) {
2457
0
            PyErr_NoMemory();
2458
0
            return NULL;
2459
0
        }
2460
0
    }
2461
75.9k
    else {
2462
75.9k
        if (targetsize < targetlen) {
2463
0
            PyErr_Format(PyExc_SystemError,
2464
0
                         "string is longer than the buffer");
2465
0
            if (copy_null && 0 < targetsize)
2466
0
                target[0] = 0;
2467
0
            return NULL;
2468
0
        }
2469
75.9k
    }
2470
75.9k
    if (kind == PyUnicode_1BYTE_KIND) {
2471
54.1k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2472
54.1k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2473
54.1k
    }
2474
21.7k
    else if (kind == PyUnicode_2BYTE_KIND) {
2475
15.5k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2476
15.5k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2477
15.5k
    }
2478
6.19k
    else if (kind == PyUnicode_4BYTE_KIND) {
2479
6.19k
        memcpy(target, data, len * sizeof(Py_UCS4));
2480
6.19k
    }
2481
0
    else {
2482
0
        Py_UNREACHABLE();
2483
0
    }
2484
75.9k
    if (copy_null)
2485
0
        target[len] = 0;
2486
75.9k
    return target;
2487
75.9k
}
2488
2489
Py_UCS4*
2490
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2491
                 int copy_null)
2492
75.9k
{
2493
75.9k
    if (target == NULL || targetsize < 0) {
2494
0
        PyErr_BadInternalCall();
2495
0
        return NULL;
2496
0
    }
2497
75.9k
    return as_ucs4(string, target, targetsize, copy_null);
2498
75.9k
}
2499
2500
Py_UCS4*
2501
PyUnicode_AsUCS4Copy(PyObject *string)
2502
0
{
2503
0
    return as_ucs4(string, NULL, 0, 1);
2504
0
}
2505
2506
/* maximum number of characters required for output of %jo or %jd or %p.
2507
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2508
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2509
   plus 1 for the terminal NUL. */
2510
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2511
2512
static int
2513
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2514
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2515
8.97M
{
2516
8.97M
    Py_ssize_t length, fill, arglen;
2517
8.97M
    Py_UCS4 maxchar;
2518
2519
8.97M
    length = PyUnicode_GET_LENGTH(str);
2520
8.97M
    if ((precision == -1 || precision >= length)
2521
8.97M
        && width <= length)
2522
8.97M
        return _PyUnicodeWriter_WriteStr(writer, str);
2523
2524
53
    if (precision != -1)
2525
53
        length = Py_MIN(precision, length);
2526
2527
53
    arglen = Py_MAX(length, width);
2528
53
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2529
25
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2530
28
    else
2531
28
        maxchar = writer->maxchar;
2532
2533
53
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2534
0
        return -1;
2535
2536
53
    fill = Py_MAX(width - length, 0);
2537
53
    if (fill && !(flags & F_LJUST)) {
2538
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2539
0
            return -1;
2540
0
        writer->pos += fill;
2541
0
    }
2542
2543
53
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2544
53
                                  str, 0, length);
2545
53
    writer->pos += length;
2546
2547
53
    if (fill && (flags & F_LJUST)) {
2548
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2549
0
            return -1;
2550
0
        writer->pos += fill;
2551
0
    }
2552
2553
53
    return 0;
2554
53
}
2555
2556
static int
2557
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2558
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2559
3.47M
{
2560
    /* UTF-8 */
2561
3.47M
    Py_ssize_t *pconsumed = NULL;
2562
3.47M
    Py_ssize_t length;
2563
3.47M
    if (precision == -1) {
2564
1.16M
        length = strlen(str);
2565
1.16M
    }
2566
2.30M
    else {
2567
2.30M
        length = 0;
2568
36.6M
        while (length < precision && str[length]) {
2569
34.3M
            length++;
2570
34.3M
        }
2571
2.30M
        if (length == precision) {
2572
            /* The input string is not NUL-terminated.  If it ends with an
2573
             * incomplete UTF-8 sequence, truncate the string just before it.
2574
             * Incomplete sequences in the middle and sequences which cannot
2575
             * be valid prefixes are still treated as errors and replaced
2576
             * with \xfffd. */
2577
1.80k
            pconsumed = &length;
2578
1.80k
        }
2579
2.30M
    }
2580
2581
3.47M
    if (width < 0) {
2582
3.47M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2583
3.47M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2584
3.47M
    }
2585
2586
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2587
0
                                                     "replace", pconsumed);
2588
0
    if (unicode == NULL)
2589
0
        return -1;
2590
2591
0
    int res = unicode_fromformat_write_str(writer, unicode,
2592
0
                                           width, -1, flags);
2593
0
    Py_DECREF(unicode);
2594
0
    return res;
2595
0
}
2596
2597
static int
2598
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2599
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2600
0
{
2601
0
    Py_ssize_t length;
2602
0
    if (precision == -1) {
2603
0
        length = wcslen(str);
2604
0
    }
2605
0
    else {
2606
0
        length = 0;
2607
0
        while (length < precision && str[length]) {
2608
0
            length++;
2609
0
        }
2610
0
    }
2611
2612
0
    if (width < 0) {
2613
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2614
0
                                             str, length);
2615
0
    }
2616
2617
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2618
0
    if (unicode == NULL)
2619
0
        return -1;
2620
2621
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2622
0
    Py_DECREF(unicode);
2623
0
    return res;
2624
0
}
2625
2626
0
#define F_LONG 1
2627
0
#define F_LONGLONG 2
2628
217k
#define F_SIZE 3
2629
0
#define F_PTRDIFF 4
2630
0
#define F_INTMAX 5
2631
2632
/* Format one conversion specification of a PyUnicode_FromFormat()-style
   format string and append the result to *writer*.

   f must point at the '%' that starts the specification.  The arguments the
   specification needs are fetched from *vargs with va_arg().

   Return a pointer to the first character after the specification, or NULL
   on error.  The error paths spelled out in this function set an exception
   (ValueError, OverflowError, TypeError or SystemError) before returning. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int flags = 0;
    Py_ssize_t width;
    Py_ssize_t precision;

    /* Remember the start of the specification for the error message. */
    p = f;
    f++;
    /* "%%" writes a literal percent sign and consumes no argument. */
    if (*f == '%') {
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        f++;
        return f;
    }

    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
    /* Flags '+', ' ' and '#' are not particularly useful.
     * They are not worth the implementation and maintenance costs.
     * In addition, '#' should add "0" for "o" conversions for compatibility
     * with printf, but it would confuse Python users. */
    while (1) {
        switch (*f++) {
        case '-': flags |= F_LJUST; continue;
        case '0': flags |= F_ZERO; continue;
        case '#': flags |= F_ALT; continue;
        }
        /* Not a flag character: undo the increment and stop. */
        f--;
        break;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (*f == '*') {
        /* Width passed as an int argument; a negative value means
           left-justify with the absolute value as width. */
        width = va_arg(*vargs, int);
        if (width < 0) {
            flags |= F_LJUST;
            width = -width;
        }
        f++;
    }
    else if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Check before multiplying so that width * 10 + digit cannot
               exceed PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (*f == '*') {
            /* Precision passed as an int argument; any negative value is
               normalized to -2. */
            precision = va_arg(*vargs, int);
            if (precision < 0) {
                precision = -2;
            }
            f++;
        }
        else if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow check as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
    }

    /* Parse the optional size modifier: "l", "ll", "z", "t" or "j". */
    int sizemod = 0;
    if (*f == 'l') {
        if (f[1] == 'l') {
            sizemod = F_LONGLONG;
            f += 2;
        }
        else {
            sizemod = F_LONG;
            ++f;
        }
    }
    else if (*f == 'z') {
        sizemod = F_SIZE;
        ++f;
    }
    else if (*f == 't') {
        sizemod = F_PTRDIFF;
        ++f;
    }
    else if (*f == 'j') {
        sizemod = F_INTMAX;
        ++f;
    }
    /* The conversion character is the last character of the format string:
       turn off overallocation in the writer. */
    if (f[0] != '\0' && f[1] == '\0')
        writer->overallocate = 0;

    /* Reject size modifier / width / precision combinations that the
       conversion character does not accept. */
    switch (*f) {
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
        break;
    case 'c': case 'p':
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
        break;
    case 's':
    case 'V':
        if (sizemod && sizemod != F_LONG) goto invalid_format;
        break;
    default:
        if (sizemod) goto invalid_format;
        break;
    }

    /* Perform the conversion. */
    switch (*f) {
    case 'c':
    {
        /* A single code point passed as an int. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'd': case 'i':
    case 'o': case 'u': case 'x': case 'X':
    {
        char buffer[MAX_INTMAX_CHARS];

        // Fill buffer using sprintf, with one of many possible format
        // strings, like "%llX" for `long long` in hexadecimal.
        // The type/size is in `sizemod`; the format is in `*f`.

        // Use macros with nested switches to keep the sprintf format strings
        // as compile-time literals, avoiding warnings and maybe allowing
        // optimizations.

        // `SPRINT` macro does one sprintf
        // Example usage: SPRINT("l", "X", unsigned long) expands to
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))

        // One inner switch to handle all format variants
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
            switch (*f) {                                                     \
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
            }

        // Outer switch to handle all the sizes/types
        switch (sizemod) {
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
            default:         DO_SPRINTS("", int, unsigned int); break;
        }
        #undef SPRINT
        #undef DO_SPRINTS

        assert(len >= 0);

        /* From here on `len` counts the digits only; the sign is written
           separately. */
        int sign = (buffer[0] == '-');
        len -= sign;

        /* precision: minimum number of digits; width: minimum total number
           of characters (sign included). */
        precision = Py_MAX(precision, len);
        width = Py_MAX(width, precision + sign);
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
            /* Zero flag without left-justification: the whole field after
               the sign is filled with zeros. */
            precision = width - sign;
        }

        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);

        /* Make room for `width` characters with maximum character 127. */
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
            return NULL;

        /* Right-justified output: space padding comes first. */
        if (spacepad && !(flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }

        if (sign) {
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
                return NULL;
        }

        if (zeropad) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
                return NULL;
            writer->pos += zeropad;
        }

        /* Write the digits, skipping the '-' if there is one. */
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
            return NULL;

        /* Left-justified output: space padding comes last. */
        if (spacepad && (flags & F_LJUST)) {
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
                return NULL;
            writer->pos += spacepad;
        }
        break;
    }

    case 'p':
    {
        char number[MAX_INTMAX_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (terminator included) right
               by two characters and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        if (sizemod) {
            /* "%ls": wide character string (only F_LONG passed the
               validation above). */
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        else {
            /* UTF-8 */
            const char *s = va_arg(*vargs, const char*);
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a str object (may be NULL)
           and a C string used as a fallback when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str;
        const wchar_t *wstr;
        if (sizemod) {
            wstr = va_arg(*vargs, const wchar_t*);
        }
        else {
            str = va_arg(*vargs, const char *);
        }
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
                return NULL;
        }
        else if (sizemod) {
            assert(wstr != NULL);
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* Result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* Result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* Result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case 'T':
    {
        /* Fully qualified name of the type of the argument.  A strong
           reference to the type is held while the name is computed. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));

        PyObject *type_name;
        if (flags & F_ALT) {
            /* "%#T": pass ':' to _PyType_GetFullyQualifiedName(). */
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        Py_DECREF(type);
        if (!type_name) {
            return NULL;
        }

        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    case 'N':
    {
        /* Like 'T', but the argument is the type itself. */
        PyObject *type_raw = va_arg(*vargs, PyObject *);
        assert(type_raw != NULL);

        if (!PyType_Check(type_raw)) {
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
            return NULL;
        }
        PyTypeObject *type = (PyTypeObject*)type_raw;

        PyObject *type_name;
        if (flags & F_ALT) {
            /* "%#N": pass ':' to _PyType_GetFullyQualifiedName(). */
            type_name = _PyType_GetFullyQualifiedName(type, ':');
        }
        else {
            type_name = PyType_GetFullyQualifiedName(type);
        }
        if (!type_name) {
            return NULL;
        }
        if (unicode_fromformat_write_str(writer, type_name,
                                         width, precision, flags) == -1) {
            Py_DECREF(type_name);
            return NULL;
        }
        Py_DECREF(type_name);
        break;
    }

    default:
    invalid_format:
        /* Unknown conversion character, or a combination rejected by the
           validation switch above.  `p` is the start of the specification. */
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
        return NULL;
    }

    /* Skip the conversion character. */
    f++;
    return f;
}
3046
3047
static int
3048
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3049
12.5M
{
3050
12.5M
    Py_ssize_t len = strlen(format);
3051
12.5M
    writer->min_length += len + 100;
3052
12.5M
    writer->overallocate = 1;
3053
3054
    // Copy varags to be able to pass a reference to a subfunction.
3055
12.5M
    va_list vargs2;
3056
12.5M
    va_copy(vargs2, vargs);
3057
3058
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3059
    // to be encoded to ASCII.
3060
12.5M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3061
12.5M
    if (!is_ascii) {
3062
0
        Py_ssize_t i;
3063
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3064
0
        PyErr_Format(PyExc_ValueError,
3065
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3066
0
            "string, got a non-ASCII byte: 0x%02x",
3067
0
            (unsigned char)format[i]);
3068
0
        goto fail;
3069
0
    }
3070
3071
69.7M
    for (const char *f = format; *f; ) {
3072
57.2M
        if (*f == '%') {
3073
26.6M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3074
26.6M
            if (f == NULL)
3075
0
                goto fail;
3076
26.6M
        }
3077
30.5M
        else {
3078
30.5M
            const char *p = strchr(f, '%');
3079
30.5M
            if (p != NULL) {
3080
22.8M
                len = p - f;
3081
22.8M
            }
3082
7.64M
            else {
3083
7.64M
                len = strlen(f);
3084
7.64M
                writer->overallocate = 0;
3085
7.64M
            }
3086
3087
30.5M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3088
0
                goto fail;
3089
0
            }
3090
30.5M
            f += len;
3091
30.5M
        }
3092
57.2M
    }
3093
12.5M
    va_end(vargs2);
3094
12.5M
    return 0;
3095
3096
0
  fail:
3097
0
    va_end(vargs2);
3098
0
    return -1;
3099
12.5M
}
3100
3101
PyObject *
3102
PyUnicode_FromFormatV(const char *format, va_list vargs)
3103
12.5M
{
3104
12.5M
    _PyUnicodeWriter writer;
3105
12.5M
    _PyUnicodeWriter_Init(&writer);
3106
3107
12.5M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3108
0
        _PyUnicodeWriter_Dealloc(&writer);
3109
0
        return NULL;
3110
0
    }
3111
12.5M
    return _PyUnicodeWriter_Finish(&writer);
3112
12.5M
}
3113
3114
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments into
   a va_list and forward them.  Return whatever PyUnicode_FromFormatV()
   returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    PyObject *result = PyUnicode_FromFormatV(format, args);
    va_end(args);

    return result;
}
3125
3126
int
3127
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3128
0
{
3129
0
    va_list vargs;
3130
0
    va_start(vargs, format);
3131
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3132
0
    va_end(vargs);
3133
0
    return res;
3134
0
}
3135
3136
int
3137
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3138
                         va_list vargs)
3139
0
{
3140
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3141
0
    Py_ssize_t old_pos = _writer->pos;
3142
3143
0
    int res = unicode_from_format(_writer, format, vargs);
3144
3145
0
    if (res < 0) {
3146
0
        _writer->pos = old_pos;
3147
0
    }
3148
0
    return res;
3149
0
}
3150
3151
static Py_ssize_t
3152
unicode_get_widechar_size(PyObject *unicode)
3153
263k
{
3154
263k
    Py_ssize_t res;
3155
3156
263k
    assert(unicode != NULL);
3157
263k
    assert(_PyUnicode_CHECK(unicode));
3158
3159
263k
    res = _PyUnicode_LENGTH(unicode);
3160
#if SIZEOF_WCHAR_T == 2
3161
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3162
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3163
        const Py_UCS4 *end = s + res;
3164
        for (; s < end; ++s) {
3165
            if (*s > 0xFFFF) {
3166
                ++res;
3167
            }
3168
        }
3169
    }
3170
#endif
3171
263k
    return res;
3172
263k
}
3173
3174
static void
3175
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3176
263k
{
3177
263k
    assert(unicode != NULL);
3178
263k
    assert(_PyUnicode_CHECK(unicode));
3179
3180
263k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3181
804
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3182
804
        return;
3183
804
    }
3184
3185
262k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3186
194k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3187
3.95M
        for (; size--; ++s, ++w) {
3188
3.76M
            *w = *s;
3189
3.76M
        }
3190
194k
    }
3191
68.3k
    else {
3192
68.3k
#if SIZEOF_WCHAR_T == 4
3193
68.3k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3194
68.3k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3195
1.36M
        for (; size--; ++s, ++w) {
3196
1.29M
            *w = *s;
3197
1.29M
        }
3198
#else
3199
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3200
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3201
        for (; size--; ++s, ++w) {
3202
            Py_UCS4 ch = *s;
3203
            if (ch > 0xFFFF) {
3204
                assert(ch <= MAX_UNICODE);
3205
                /* encode surrogate pair in this case */
3206
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3207
                if (!size--)
3208
                    break;
3209
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3210
            }
3211
            else {
3212
                *w = ch;
3213
            }
3214
        }
3215
#endif
3216
68.3k
    }
3217
262k
}
3218
3219
#ifdef HAVE_WCHAR_H
3220
3221
/* Convert a Unicode object to a wide character string.
3222
3223
   - If w is NULL: return the number of wide characters (including the null
3224
     character) required to convert the unicode object. Ignore size argument.
3225
3226
   - Otherwise: return the number of wide characters (excluding the null
3227
     character) written into w. Write at most size wide characters (including
3228
     the null character). */
3229
Py_ssize_t
3230
PyUnicode_AsWideChar(PyObject *unicode,
3231
                     wchar_t *w,
3232
                     Py_ssize_t size)
3233
1.40k
{
3234
1.40k
    Py_ssize_t res;
3235
3236
1.40k
    if (unicode == NULL) {
3237
0
        PyErr_BadInternalCall();
3238
0
        return -1;
3239
0
    }
3240
1.40k
    if (!PyUnicode_Check(unicode)) {
3241
0
        PyErr_BadArgument();
3242
0
        return -1;
3243
0
    }
3244
3245
1.40k
    res = unicode_get_widechar_size(unicode);
3246
1.40k
    if (w == NULL) {
3247
0
        return res + 1;
3248
0
    }
3249
3250
1.40k
    if (size > res) {
3251
1.40k
        size = res + 1;
3252
1.40k
    }
3253
0
    else {
3254
0
        res = size;
3255
0
    }
3256
1.40k
    unicode_copy_as_widechar(unicode, w, size);
3257
3258
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3259
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3260
       non-Unicode locales and hence needs conversion first. */
3261
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3262
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3263
            return -1;
3264
        }
3265
    }
3266
#endif
3267
3268
1.40k
    return res;
3269
1.40k
}
3270
3271
wchar_t*
3272
PyUnicode_AsWideCharString(PyObject *unicode,
3273
                           Py_ssize_t *size)
3274
261k
{
3275
261k
    wchar_t *buffer;
3276
261k
    Py_ssize_t buflen;
3277
3278
261k
    if (unicode == NULL) {
3279
0
        PyErr_BadInternalCall();
3280
0
        return NULL;
3281
0
    }
3282
261k
    if (!PyUnicode_Check(unicode)) {
3283
0
        PyErr_BadArgument();
3284
0
        return NULL;
3285
0
    }
3286
3287
261k
    buflen = unicode_get_widechar_size(unicode);
3288
261k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3289
261k
    if (buffer == NULL) {
3290
0
        PyErr_NoMemory();
3291
0
        return NULL;
3292
0
    }
3293
261k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3294
3295
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3296
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3297
       non-Unicode locales and hence needs conversion first. */
3298
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3299
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3300
            return NULL;
3301
        }
3302
    }
3303
#endif
3304
3305
261k
    if (size != NULL) {
3306
260k
        *size = buflen;
3307
260k
    }
3308
1.11k
    else if (wcslen(buffer) != (size_t)buflen) {
3309
0
        PyMem_Free(buffer);
3310
0
        PyErr_SetString(PyExc_ValueError,
3311
0
                        "embedded null character");
3312
0
        return NULL;
3313
0
    }
3314
261k
    return buffer;
3315
261k
}
3316
3317
#endif /* HAVE_WCHAR_H */
3318
3319
int
3320
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3321
0
{
3322
0
    wchar_t **p = (wchar_t **)ptr;
3323
0
    if (obj == NULL) {
3324
0
        PyMem_Free(*p);
3325
0
        *p = NULL;
3326
0
        return 1;
3327
0
    }
3328
0
    if (PyUnicode_Check(obj)) {
3329
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3330
0
        if (*p == NULL) {
3331
0
            return 0;
3332
0
        }
3333
0
        return Py_CLEANUP_SUPPORTED;
3334
0
    }
3335
0
    PyErr_Format(PyExc_TypeError,
3336
0
                 "argument must be str, not %.50s",
3337
0
                 Py_TYPE(obj)->tp_name);
3338
0
    return 0;
3339
0
}
3340
3341
int
3342
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3343
0
{
3344
0
    wchar_t **p = (wchar_t **)ptr;
3345
0
    if (obj == NULL) {
3346
0
        PyMem_Free(*p);
3347
0
        *p = NULL;
3348
0
        return 1;
3349
0
    }
3350
0
    if (obj == Py_None) {
3351
0
        *p = NULL;
3352
0
        return 1;
3353
0
    }
3354
0
    if (PyUnicode_Check(obj)) {
3355
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3356
0
        if (*p == NULL) {
3357
0
            return 0;
3358
0
        }
3359
0
        return Py_CLEANUP_SUPPORTED;
3360
0
    }
3361
0
    PyErr_Format(PyExc_TypeError,
3362
0
                 "argument must be str or None, not %.50s",
3363
0
                 Py_TYPE(obj)->tp_name);
3364
0
    return 0;
3365
0
}
3366
3367
PyObject *
3368
PyUnicode_FromOrdinal(int ordinal)
3369
8.98M
{
3370
8.98M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3371
32
        PyErr_SetString(PyExc_ValueError,
3372
32
                        "chr() arg not in range(0x110000)");
3373
32
        return NULL;
3374
32
    }
3375
3376
8.98M
    return unicode_char((Py_UCS4)ordinal);
3377
8.98M
}
3378
3379
/* Return *obj* as an exact str object.

   - An exact str is returned as a new reference to the same object.
   - An instance of a str subtype is copied with _PyUnicode_Copy().
   - Any other object sets TypeError and yields NULL.

   XXX Perhaps this API should be an alias of PyObject_Str() instead ?! */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3397
3398
/* Decode the bytes-like object *obj* with PyUnicode_Decode(), using
   *encoding* and *errors*.

   - bytes objects are decoded directly from their internal buffer;
   - str objects are rejected with TypeError;
   - any other object must support the buffer interface (PyBUF_SIMPLE),
     otherwise TypeError is set.

   Empty input returns the empty string after the encoding and errors
   arguments have been checked with unicode_check_encoding_errors().

   Return NULL with an exception set on error. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        const Py_ssize_t nbytes = PyBytes_GET_SIZE(obj);
        if (nbytes != 0) {
            return PyUnicode_Decode(PyBytes_AS_STRING(obj), nbytes,
                                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* The view is released before the arguments are validated. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3450
3451
/* Normalize an encoding name like encodings.normalize_encoding()
   but allow to convert to lowercase if *to_lower* is true.

   Alphanumeric characters and '.' are copied to *lower* (lowercased when
   *to_lower* is true).  Every run of other characters between two copied
   characters becomes a single '_'; such runs at the start or at the end of
   the name are dropped.  The result is always null-terminated on success.

   *lower* is a buffer of *lower_len* bytes, terminating null byte included.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len,
                       int to_lower)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminating null byte.  Without this check
           `lower_len - 1` below would wrap around and the bounds checks
           would never trigger. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable position: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    /* Set while skipping a run of non-alphanumeric characters. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the skipped run, unless nothing has been
               written yet. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = to_lower ? Py_TOLOWER(c) : c;
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3500
3501
PyObject *
3502
PyUnicode_Decode(const char *s,
3503
                 Py_ssize_t size,
3504
                 const char *encoding,
3505
                 const char *errors)
3506
22.1M
{
3507
22.1M
    PyObject *buffer = NULL, *unicode;
3508
22.1M
    Py_buffer info;
3509
22.1M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3510
3511
22.1M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3512
0
        return NULL;
3513
0
    }
3514
3515
22.1M
    if (size == 0) {
3516
0
        _Py_RETURN_UNICODE_EMPTY();
3517
0
    }
3518
3519
22.1M
    if (encoding == NULL) {
3520
40.3k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3521
40.3k
    }
3522
3523
    /* Shortcuts for common default encodings */
3524
22.0M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3525
22.0M
        char *lower = buflower;
3526
3527
        /* Fast paths */
3528
22.0M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3529
4.12M
            lower += 3;
3530
4.12M
            if (*lower == '_') {
3531
                /* Match "utf8" and "utf_8" */
3532
4.12M
                lower++;
3533
4.12M
            }
3534
3535
4.12M
            if (lower[0] == '8' && lower[1] == 0) {
3536
4.12M
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3537
4.12M
            }
3538
1.17k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3539
172
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3540
172
            }
3541
1.00k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3542
155
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3543
155
            }
3544
4.12M
        }
3545
17.9M
        else {
3546
17.9M
            if (strcmp(lower, "ascii") == 0
3547
13.8M
                || strcmp(lower, "us_ascii") == 0) {
3548
13.8M
                return PyUnicode_DecodeASCII(s, size, errors);
3549
13.8M
            }
3550
    #ifdef MS_WINDOWS
3551
            else if (strcmp(lower, "mbcs") == 0) {
3552
                return PyUnicode_DecodeMBCS(s, size, errors);
3553
            }
3554
    #endif
3555
4.06M
            else if (strcmp(lower, "latin1") == 0
3556
4.06M
                     || strcmp(lower, "latin_1") == 0
3557
1.22M
                     || strcmp(lower, "iso_8859_1") == 0
3558
2.86M
                     || strcmp(lower, "iso8859_1") == 0) {
3559
2.86M
                return PyUnicode_DecodeLatin1(s, size, errors);
3560
2.86M
            }
3561
17.9M
        }
3562
22.0M
    }
3563
3564
    /* Decode via the codec registry */
3565
1.20M
    buffer = NULL;
3566
1.20M
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3567
0
        goto onError;
3568
1.20M
    buffer = PyMemoryView_FromBuffer(&info);
3569
1.20M
    if (buffer == NULL)
3570
0
        goto onError;
3571
1.20M
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3572
1.20M
    if (unicode == NULL)
3573
69.8k
        goto onError;
3574
1.13M
    if (!PyUnicode_Check(unicode)) {
3575
0
        PyErr_Format(PyExc_TypeError,
3576
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3577
0
                     "use codecs.decode() to decode to arbitrary types",
3578
0
                     encoding,
3579
0
                     Py_TYPE(unicode)->tp_name);
3580
0
        Py_DECREF(unicode);
3581
0
        goto onError;
3582
0
    }
3583
1.13M
    Py_DECREF(buffer);
3584
1.13M
    return unicode_result(unicode);
3585
3586
69.8k
  onError:
3587
69.8k
    Py_XDECREF(buffer);
3588
69.8k
    return NULL;
3589
1.13M
}
3590
3591
/* Run the str object `unicode` through the decoder registered for
   `encoding` (the default encoding when NULL) and return whatever object
   the codec produces.  Returns NULL with an exception set when `unicode`
   is not a str or when the codec call fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (PyUnicode_Check(unicode)) {
        if (encoding == NULL) {
            encoding = PyUnicode_GetDefaultEncoding();
        }
        /* Decode via the codec registry */
        return PyCodec_Decode(unicode, encoding, errors);
    }

    PyErr_BadArgument();
    return NULL;
}
3607
3608
/* Decode the str object `unicode` through the codec registry and require
   the codec to hand back a str.

   `encoding` may be NULL, in which case PyUnicode_GetDefaultEncoding() is
   used.  Returns a new reference on success.  Returns NULL with an
   exception set when `unicode` is not a str, when the codec call fails,
   or (TypeError) when the decoder returns something other than a str. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* Name the type the decoder actually returned (v).  The input
           object is known to be a str at this point, so reporting its
           type would always produce the misleading
           "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3641
3642
/* Run the str object `unicode` through the encoder registered for
   `encoding` (the default encoding when NULL) and return the codec's
   result unchanged.  Returns NULL with an exception set when `unicode`
   is not a str or when the codec call fails. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry; a NULL result is passed through
       to the caller as the error indicator. */
    return PyCodec_Encode(unicode, encoding, errors);
}
3666
3667
3668
static PyObject *
3669
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3670
                      int current_locale)
3671
0
{
3672
0
    Py_ssize_t wlen;
3673
0
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3674
0
    if (wstr == NULL) {
3675
0
        return NULL;
3676
0
    }
3677
3678
0
    if ((size_t)wlen != wcslen(wstr)) {
3679
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3680
0
        PyMem_Free(wstr);
3681
0
        return NULL;
3682
0
    }
3683
3684
0
    char *str;
3685
0
    size_t error_pos;
3686
0
    const char *reason;
3687
0
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3688
0
                                 current_locale, error_handler);
3689
0
    PyMem_Free(wstr);
3690
3691
0
    if (res != 0) {
3692
0
        if (res == -2) {
3693
0
            PyObject *exc;
3694
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3695
0
                    "locale", unicode,
3696
0
                    (Py_ssize_t)error_pos,
3697
0
                    (Py_ssize_t)(error_pos+1),
3698
0
                    reason);
3699
0
            if (exc != NULL) {
3700
0
                PyCodec_StrictErrors(exc);
3701
0
                Py_DECREF(exc);
3702
0
            }
3703
0
        }
3704
0
        else if (res == -3) {
3705
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3706
0
        }
3707
0
        else {
3708
0
            PyErr_NoMemory();
3709
0
        }
3710
0
        return NULL;
3711
0
    }
3712
3713
0
    PyObject *bytes = PyBytes_FromString(str);
3714
0
    PyMem_RawFree(str);
3715
0
    return bytes;
3716
0
}
3717
3718
/* Encode `unicode` via unicode_encode_locale() with current_locale=1,
   translating the error handler name `errors` with
   _Py_GetErrorHandler(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3724
3725
/* Encode `unicode` with the interpreter's filesystem codec.

   Three cases, tried in order:
     1. the filesystem codec is flagged as UTF-8: use the UTF-8 encoder;
     2. a filesystem encoding name is configured: go through
        PyUnicode_AsEncodedString() (compiled out when
        _Py_FORCE_UTF8_FS_ENCODING is defined);
     3. otherwise fall back on the error handler taken from
        config->filesystem_errors. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *fs_errors = config->filesystem_errors;
    assert(fs_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(fs_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3758
3759
/* Encode the str object `unicode` to bytes using the codec named
   `encoding` and the error handler named `errors`.

   A NULL encoding selects UTF-8.  Common encoding names are dispatched
   directly to the built-in encoders; everything else goes through the
   codec registry.  A codec that returns a bytearray triggers a
   RuntimeWarning and the result is copied into a bytes object; any
   other non-bytes result raises TypeError.  Returns a new reference or
   NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Sized for the longest shortcut spelling ("iso_8859_1", 10 chars)
       plus the terminating NUL. */
    char normalized[11];
    PyObject *encoded;
    PyObject *result = NULL;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }
    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Fast paths: only attempted when the normalized, lower-cased name
       fits in the small buffer. */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized), 1)) {
        const char *name = normalized;

        if (strncmp(name, "utf", 3) == 0) {
            name += 3;
            if (name[0] == '_') {
                /* Accept both "utf8" and "utf_8" spellings */
                name++;
            }
            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0 || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0
                || strcmp(name, "latin_1") == 0
                || strcmp(name, "iso_8859_1") == 0
                || strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (PyByteArray_Check(encoded)) {
        /* The codec returned a bytearray: warn, then copy into bytes
           unless the warning was turned into an error. */
        if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
                "encoder %s returned bytearray instead of bytes; "
                "use codecs.encode() to encode to arbitrary types",
                encoding) == 0) {
            result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                               PyByteArray_GET_SIZE(encoded));
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
    }

    Py_DECREF(encoded);
    return result;
}
3858
3859
/* Encode the str object `unicode` through the codec registry and require
   the codec to return a str.

   `encoding` may be NULL, in which case PyUnicode_GetDefaultEncoding() is
   used.  Returns a new reference, or NULL with an exception set
   (TypeError when the encoder returns a non-str object). */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *encoded;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    encoded = PyCodec_Encode(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3892
3893
static PyObject*
3894
unicode_decode_locale(const char *str, Py_ssize_t len,
3895
                      _Py_error_handler errors, int current_locale)
3896
389k
{
3897
389k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3898
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3899
0
        return NULL;
3900
0
    }
3901
3902
389k
    wchar_t *wstr;
3903
389k
    size_t wlen;
3904
389k
    const char *reason;
3905
389k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3906
389k
                                 current_locale, errors);
3907
389k
    if (res != 0) {
3908
0
        if (res == -2) {
3909
0
            PyObject *exc;
3910
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3911
0
                                        "locale", str, len,
3912
0
                                        (Py_ssize_t)wlen,
3913
0
                                        (Py_ssize_t)(wlen + 1),
3914
0
                                        reason);
3915
0
            if (exc != NULL) {
3916
0
                PyCodec_StrictErrors(exc);
3917
0
                Py_DECREF(exc);
3918
0
            }
3919
0
        }
3920
0
        else if (res == -3) {
3921
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3922
0
        }
3923
0
        else {
3924
0
            PyErr_NoMemory();
3925
0
        }
3926
0
        return NULL;
3927
0
    }
3928
3929
389k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3930
389k
    PyMem_RawFree(wstr);
3931
389k
    return unicode;
3932
389k
}
3933
3934
PyObject*
3935
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3936
                              const char *errors)
3937
0
{
3938
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3939
0
    return unicode_decode_locale(str, len, error_handler, 1);
3940
0
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocale(const char *str, const char *errors)
3944
389k
{
3945
389k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3946
389k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
389k
    return unicode_decode_locale(str, size, error_handler, 1);
3948
389k
}
3949
3950
3951
PyObject*
3952
221
PyUnicode_DecodeFSDefault(const char *s) {
3953
221
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3954
221
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3955
221
}
3956
3957
PyObject*
3958
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3959
236k
{
3960
236k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3961
236k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3962
236k
    if (fs_codec->utf8) {
3963
236k
        return unicode_decode_utf8(s, size,
3964
236k
                                   fs_codec->error_handler,
3965
236k
                                   fs_codec->errors,
3966
236k
                                   NULL);
3967
236k
    }
3968
36
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3969
36
    else if (fs_codec->encoding) {
3970
0
        return PyUnicode_Decode(s, size,
3971
0
                                fs_codec->encoding,
3972
0
                                fs_codec->errors);
3973
0
    }
3974
36
#endif
3975
36
    else {
3976
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3977
           machinery is not ready and so cannot be used:
3978
           use mbstowcs() in this case. */
3979
36
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3980
36
        const wchar_t *filesystem_errors = config->filesystem_errors;
3981
36
        assert(filesystem_errors != NULL);
3982
36
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3983
36
        assert(errors != _Py_ERROR_UNKNOWN);
3984
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3985
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3986
#else
3987
36
        return unicode_decode_locale(s, size, errors, 0);
3988
36
#endif
3989
36
    }
3990
236k
}
3991
3992
3993
int
3994
PyUnicode_FSConverter(PyObject* arg, void* addr)
3995
315k
{
3996
315k
    PyObject *path = NULL;
3997
315k
    PyObject *output = NULL;
3998
315k
    Py_ssize_t size;
3999
315k
    const char *data;
4000
315k
    if (arg == NULL) {
4001
0
        Py_DECREF(*(PyObject**)addr);
4002
0
        *(PyObject**)addr = NULL;
4003
0
        return 1;
4004
0
    }
4005
315k
    path = PyOS_FSPath(arg);
4006
315k
    if (path == NULL) {
4007
0
        return 0;
4008
0
    }
4009
315k
    if (PyBytes_Check(path)) {
4010
0
        output = path;
4011
0
    }
4012
315k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4013
315k
        output = PyUnicode_EncodeFSDefault(path);
4014
315k
        Py_DECREF(path);
4015
315k
        if (!output) {
4016
0
            return 0;
4017
0
        }
4018
315k
        assert(PyBytes_Check(output));
4019
315k
    }
4020
4021
315k
    size = PyBytes_GET_SIZE(output);
4022
315k
    data = PyBytes_AS_STRING(output);
4023
315k
    if ((size_t)size != strlen(data)) {
4024
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4025
0
        Py_DECREF(output);
4026
0
        return 0;
4027
0
    }
4028
315k
    *(PyObject**)addr = output;
4029
315k
    return Py_CLEANUP_SUPPORTED;
4030
315k
}
4031
4032
4033
int
4034
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4035
98.9k
{
4036
98.9k
    if (arg == NULL) {
4037
0
        Py_DECREF(*(PyObject**)addr);
4038
0
        *(PyObject**)addr = NULL;
4039
0
        return 1;
4040
0
    }
4041
4042
98.9k
    PyObject *path = PyOS_FSPath(arg);
4043
98.9k
    if (path == NULL) {
4044
0
        return 0;
4045
0
    }
4046
4047
98.9k
    PyObject *output = NULL;
4048
98.9k
    if (PyUnicode_Check(path)) {
4049
98.9k
        output = path;
4050
98.9k
    }
4051
0
    else if (PyBytes_Check(path)) {
4052
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4053
0
                                                  PyBytes_GET_SIZE(path));
4054
0
        Py_DECREF(path);
4055
0
        if (!output) {
4056
0
            return 0;
4057
0
        }
4058
0
    }
4059
0
    else {
4060
0
        PyErr_Format(PyExc_TypeError,
4061
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4062
0
                     Py_TYPE(arg)->tp_name);
4063
0
        Py_DECREF(path);
4064
0
        return 0;
4065
0
    }
4066
4067
98.9k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4068
98.9k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4069
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4070
0
        Py_DECREF(output);
4071
0
        return 0;
4072
0
    }
4073
98.9k
    *(PyObject**)addr = output;
4074
98.9k
    return Py_CLEANUP_SUPPORTED;
4075
98.9k
}
4076
4077
4078
static int unicode_fill_utf8(PyObject *unicode);
4079
4080
4081
static int
4082
unicode_ensure_utf8(PyObject *unicode)
4083
71.1M
{
4084
71.1M
    int err = 0;
4085
71.1M
    if (PyUnicode_UTF8(unicode) == NULL) {
4086
161k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4087
161k
        if (PyUnicode_UTF8(unicode) == NULL) {
4088
161k
            err = unicode_fill_utf8(unicode);
4089
161k
        }
4090
161k
        Py_END_CRITICAL_SECTION();
4091
161k
    }
4092
71.1M
    return err;
4093
71.1M
}
4094
4095
const char *
4096
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4097
71.1M
{
4098
71.1M
    if (!PyUnicode_Check(unicode)) {
4099
0
        PyErr_BadArgument();
4100
0
        if (psize) {
4101
0
            *psize = -1;
4102
0
        }
4103
0
        return NULL;
4104
0
    }
4105
4106
71.1M
    if (unicode_ensure_utf8(unicode) == -1) {
4107
206
        if (psize) {
4108
206
            *psize = -1;
4109
206
        }
4110
206
        return NULL;
4111
206
    }
4112
4113
71.1M
    if (psize) {
4114
70.8M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4115
70.8M
    }
4116
71.1M
    return PyUnicode_UTF8(unicode);
4117
71.1M
}
4118
4119
const char *
4120
PyUnicode_AsUTF8(PyObject *unicode)
4121
246k
{
4122
246k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4123
246k
}
4124
4125
const char *
4126
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4127
2.92M
{
4128
2.92M
    Py_ssize_t size;
4129
2.92M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4130
2.92M
    if (s && strlen(s) != (size_t)size) {
4131
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4132
0
        return NULL;
4133
0
    }
4134
2.92M
    return s;
4135
2.92M
}
4136
4137
/*
4138
PyUnicode_GetSize() has been deprecated since Python 3.3
4139
because it returned length of Py_UNICODE.
4140
4141
But this function is part of stable abi, because it doesn't
4142
include Py_UNICODE in signature and it was not excluded from
4143
stable ABI in PEP 384.
4144
*/
4145
PyAPI_FUNC(Py_ssize_t)
4146
PyUnicode_GetSize(PyObject *unicode)
4147
0
{
4148
0
    PyErr_SetString(PyExc_RuntimeError,
4149
0
                    "PyUnicode_GetSize has been removed.");
4150
0
    return -1;
4151
0
}
4152
4153
Py_ssize_t
4154
PyUnicode_GetLength(PyObject *unicode)
4155
25.9k
{
4156
25.9k
    if (!PyUnicode_Check(unicode)) {
4157
0
        PyErr_BadArgument();
4158
0
        return -1;
4159
0
    }
4160
25.9k
    return PyUnicode_GET_LENGTH(unicode);
4161
25.9k
}
4162
4163
Py_UCS4
4164
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4165
21
{
4166
21
    const void *data;
4167
21
    int kind;
4168
4169
21
    if (!PyUnicode_Check(unicode)) {
4170
0
        PyErr_BadArgument();
4171
0
        return (Py_UCS4)-1;
4172
0
    }
4173
21
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4174
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4175
0
        return (Py_UCS4)-1;
4176
0
    }
4177
21
    data = PyUnicode_DATA(unicode);
4178
21
    kind = PyUnicode_KIND(unicode);
4179
21
    return PyUnicode_READ(kind, data, index);
4180
21
}
4181
4182
int
4183
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4184
0
{
4185
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4186
0
        PyErr_BadArgument();
4187
0
        return -1;
4188
0
    }
4189
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4190
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4191
0
        return -1;
4192
0
    }
4193
0
    if (unicode_check_modifiable(unicode))
4194
0
        return -1;
4195
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4196
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4197
0
        return -1;
4198
0
    }
4199
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4200
0
                    index, ch);
4201
0
    return 0;
4202
0
}
4203
4204
/* Return the name of the default encoding, which is always the constant
   string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
4209
4210
/* create or adjust a UnicodeDecodeError */
4211
static void
4212
make_decode_exception(PyObject **exceptionObject,
4213
                      const char *encoding,
4214
                      const char *input, Py_ssize_t length,
4215
                      Py_ssize_t startpos, Py_ssize_t endpos,
4216
                      const char *reason)
4217
2.63M
{
4218
2.63M
    if (*exceptionObject == NULL) {
4219
2.41M
        *exceptionObject = PyUnicodeDecodeError_Create(
4220
2.41M
            encoding, input, length, startpos, endpos, reason);
4221
2.41M
    }
4222
222k
    else {
4223
222k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4224
0
            goto onError;
4225
222k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4226
0
            goto onError;
4227
222k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4228
0
            goto onError;
4229
222k
    }
4230
2.63M
    return;
4231
4232
2.63M
onError:
4233
0
    Py_CLEAR(*exceptionObject);
4234
0
}
4235
4236
#ifdef MS_WINDOWS
4237
static int
4238
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4239
{
4240
    if (newsize > *size) {
4241
        wchar_t *newbuf = *buf;
4242
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4243
            PyErr_NoMemory();
4244
            return -1;
4245
        }
4246
        *buf = newbuf;
4247
    }
4248
    *size = newsize;
4249
    return 0;
4250
}
4251
4252
/* error handling callback helper:
4253
   build arguments, call the callback and check the arguments,
4254
   if no exception occurred, copy the replacement to the output
4255
   and adjust various state variables.
4256
   return 0 on success, -1 on error
4257
*/
4258
4259
static int
4260
unicode_decode_call_errorhandler_wchar(
4261
    const char *errors, PyObject **errorHandler,
4262
    const char *encoding, const char *reason,
4263
    const char **input, const char **inend, Py_ssize_t *startinpos,
4264
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4265
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4266
{
4267
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4268
4269
    PyObject *restuple = NULL;
4270
    PyObject *repunicode = NULL;
4271
    Py_ssize_t outsize;
4272
    Py_ssize_t insize;
4273
    Py_ssize_t requiredsize;
4274
    Py_ssize_t newpos;
4275
    PyObject *inputobj = NULL;
4276
    Py_ssize_t repwlen;
4277
4278
    if (*errorHandler == NULL) {
4279
        *errorHandler = PyCodec_LookupError(errors);
4280
        if (*errorHandler == NULL)
4281
            goto onError;
4282
    }
4283
4284
    make_decode_exception(exceptionObject,
4285
        encoding,
4286
        *input, *inend - *input,
4287
        *startinpos, *endinpos,
4288
        reason);
4289
    if (*exceptionObject == NULL)
4290
        goto onError;
4291
4292
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4293
    if (restuple == NULL)
4294
        goto onError;
4295
    if (!PyTuple_Check(restuple)) {
4296
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4297
        goto onError;
4298
    }
4299
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4300
        goto onError;
4301
4302
    /* Copy back the bytes variables, which might have been modified by the
4303
       callback */
4304
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4305
    if (!inputobj)
4306
        goto onError;
4307
    *input = PyBytes_AS_STRING(inputobj);
4308
    insize = PyBytes_GET_SIZE(inputobj);
4309
    *inend = *input + insize;
4310
    /* we can DECREF safely, as the exception has another reference,
4311
       so the object won't go away. */
4312
    Py_DECREF(inputobj);
4313
4314
    if (newpos<0)
4315
        newpos = insize+newpos;
4316
    if (newpos<0 || newpos>insize) {
4317
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4318
        goto onError;
4319
    }
4320
4321
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4322
    if (repwlen < 0)
4323
        goto onError;
4324
    repwlen--;
4325
    /* need more space? (at least enough for what we
4326
       have+the replacement+the rest of the string (starting
4327
       at the new input position), so we won't have to check space
4328
       when there are no errors in the rest of the string) */
4329
    requiredsize = *outpos;
4330
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4331
        goto overflow;
4332
    requiredsize += repwlen;
4333
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4334
        goto overflow;
4335
    requiredsize += insize - newpos;
4336
    outsize = *bufsize;
4337
    if (requiredsize > outsize) {
4338
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4339
            requiredsize = 2*outsize;
4340
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4341
            goto onError;
4342
        }
4343
    }
4344
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4345
    *outpos += repwlen;
4346
    *endinpos = newpos;
4347
    *inptr = *input + newpos;
4348
4349
    /* we made it! */
4350
    Py_DECREF(restuple);
4351
    return 0;
4352
4353
  overflow:
4354
    PyErr_SetString(PyExc_OverflowError,
4355
                    "decoded result is too long for a Python string");
4356
4357
  onError:
4358
    Py_XDECREF(restuple);
4359
    return -1;
4360
}
4361
#endif   /* MS_WINDOWS */
4362
4363
static int
4364
unicode_decode_call_errorhandler_writer(
4365
    const char *errors, PyObject **errorHandler,
4366
    const char *encoding, const char *reason,
4367
    const char **input, const char **inend, Py_ssize_t *startinpos,
4368
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4369
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4370
2.63M
{
4371
2.63M
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4372
4373
2.63M
    PyObject *restuple = NULL;
4374
2.63M
    PyObject *repunicode = NULL;
4375
2.63M
    Py_ssize_t insize;
4376
2.63M
    Py_ssize_t newpos;
4377
2.63M
    Py_ssize_t replen;
4378
2.63M
    Py_ssize_t remain;
4379
2.63M
    PyObject *inputobj = NULL;
4380
2.63M
    int need_to_grow = 0;
4381
2.63M
    const char *new_inptr;
4382
4383
2.63M
    if (*errorHandler == NULL) {
4384
2.41M
        *errorHandler = PyCodec_LookupError(errors);
4385
2.41M
        if (*errorHandler == NULL)
4386
0
            goto onError;
4387
2.41M
    }
4388
4389
2.63M
    make_decode_exception(exceptionObject,
4390
2.63M
        encoding,
4391
2.63M
        *input, *inend - *input,
4392
2.63M
        *startinpos, *endinpos,
4393
2.63M
        reason);
4394
2.63M
    if (*exceptionObject == NULL)
4395
0
        goto onError;
4396
4397
2.63M
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4398
2.63M
    if (restuple == NULL)
4399
2.37M
        goto onError;
4400
263k
    if (!PyTuple_Check(restuple)) {
4401
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4402
0
        goto onError;
4403
0
    }
4404
263k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4405
0
        goto onError;
4406
4407
    /* Copy back the bytes variables, which might have been modified by the
4408
       callback */
4409
263k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4410
263k
    if (!inputobj)
4411
0
        goto onError;
4412
263k
    remain = *inend - *input - *endinpos;
4413
263k
    *input = PyBytes_AS_STRING(inputobj);
4414
263k
    insize = PyBytes_GET_SIZE(inputobj);
4415
263k
    *inend = *input + insize;
4416
    /* we can DECREF safely, as the exception has another reference,
4417
       so the object won't go away. */
4418
263k
    Py_DECREF(inputobj);
4419
4420
263k
    if (newpos<0)
4421
0
        newpos = insize+newpos;
4422
263k
    if (newpos<0 || newpos>insize) {
4423
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4424
0
        goto onError;
4425
0
    }
4426
4427
263k
    replen = PyUnicode_GET_LENGTH(repunicode);
4428
263k
    if (replen > 1) {
4429
27.0k
        writer->min_length += replen - 1;
4430
27.0k
        need_to_grow = 1;
4431
27.0k
    }
4432
263k
    new_inptr = *input + newpos;
4433
263k
    if (*inend - new_inptr > remain) {
4434
        /* We don't know the decoding algorithm here so we make the worst
4435
           assumption that one byte decodes to one unicode character.
4436
           If unfortunately one byte could decode to more unicode characters,
4437
           the decoder may write out-of-bound then.  Is it possible for the
4438
           algorithms using this function? */
4439
12.1k
        writer->min_length += *inend - new_inptr - remain;
4440
12.1k
        need_to_grow = 1;
4441
12.1k
    }
4442
263k
    if (need_to_grow) {
4443
27.2k
        writer->overallocate = 1;
4444
27.2k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4445
27.2k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4446
0
            goto onError;
4447
27.2k
    }
4448
263k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4449
0
        goto onError;
4450
4451
263k
    *endinpos = newpos;
4452
263k
    *inptr = new_inptr;
4453
4454
    /* we made it! */
4455
263k
    Py_DECREF(restuple);
4456
263k
    return 0;
4457
4458
2.37M
  onError:
4459
2.37M
    Py_XDECREF(restuple);
4460
2.37M
    return -1;
4461
263k
}
4462
4463
/* --- UTF-7 Codec -------------------------------------------------------- */
4464
4465
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4466
4467
/* Three simple macros defining base-64. */
4468
4469
/* True when c is one of the 64 base-64 alphabet characters:
   A-Z, a-z, 0-9, '+' or '/'.  The argument is evaluated several times. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z')         \
     || ((c) >= 'a' && (c) <= 'z')      \
     || ((c) >= '0' && (c) <= '9')      \
     || (c) == '+'                      \
     || (c) == '/')
4476
4477
/* Map the base-64 character c to its 6-bit value (0..63).  The caller must
   already know that IS_BASE64(c) holds: anything that is not a letter, a
   digit or '+' is taken to be '/' and yields 63.  The argument is expanded
   several times, so it must have no side effects. */

#define FROM_BASE64(c)                                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     '+' == (c) ? 62 : 63)
4484
4485
/* Base-64 character for the low 6 bits of n; any higher bits of n are
   masked off before the alphabet lookup. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
4489
4490
/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes
 * to itself.  Decoding is deliberately permissive: every value up to 127
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) < 128 && '+' != (c))
4497
4498
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4531
4532
/* ENCODE_DIRECT: true when the encoder should emit character c as
 * itself.  That is the case for every non-NUL ASCII character whose
 * utf7_category is not 3 ("special"), so Set D, Set O and whitespace are
 * all written directly.  RFC 2152 leaves the treatment of Set O and
 * whitespace to the application; this macro takes no options and always
 * writes both directly.  The range test runs before the table lookup, so
 * the index is always within the 128-entry table. */

#define ENCODE_DIRECT(c) \
    ((c) > 0 && (c) < 128 && utf7_category[(c)] != 3)
4540
4541
PyObject *
4542
PyUnicode_DecodeUTF7(const char *s,
4543
                     Py_ssize_t size,
4544
                     const char *errors)
4545
0
{
4546
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4547
0
}
4548
4549
/* The decoder.  The only state we preserve is our read position,
4550
 * i.e. how many characters we have consumed.  So if we end in the
4551
 * middle of a shift sequence we have to back off the read position
4552
 * and the output to the beginning of the sequence, otherwise we lose
4553
 * all the shift state (seen bits, number of bits seen, high
4554
 * surrogate). */
4555
4556
PyObject *
4557
PyUnicode_DecodeUTF7Stateful(const char *s,
4558
                             Py_ssize_t size,
4559
                             const char *errors,
4560
                             Py_ssize_t *consumed)
4561
20.1k
{
4562
20.1k
    const char *starts = s;
4563
20.1k
    Py_ssize_t startinpos;
4564
20.1k
    Py_ssize_t endinpos;
4565
20.1k
    const char *e;
4566
20.1k
    _PyUnicodeWriter writer;
4567
20.1k
    const char *errmsg = "";
4568
20.1k
    int inShift = 0;
4569
20.1k
    Py_ssize_t shiftOutStart;
4570
20.1k
    unsigned int base64bits = 0;
4571
20.1k
    unsigned long base64buffer = 0;
4572
20.1k
    Py_UCS4 surrogate = 0;
4573
20.1k
    PyObject *errorHandler = NULL;
4574
20.1k
    PyObject *exc = NULL;
4575
4576
20.1k
    if (size == 0) {
4577
0
        if (consumed)
4578
0
            *consumed = 0;
4579
0
        _Py_RETURN_UNICODE_EMPTY();
4580
0
    }
4581
4582
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4583
20.1k
    _PyUnicodeWriter_Init(&writer);
4584
20.1k
    writer.min_length = size;
4585
4586
20.1k
    shiftOutStart = 0;
4587
20.1k
    e = s + size;
4588
4589
5.65M
    while (s < e) {
4590
5.64M
        Py_UCS4 ch;
4591
5.64M
      restart:
4592
5.64M
        ch = (unsigned char) *s;
4593
4594
5.64M
        if (inShift) { /* in a base-64 section */
4595
290k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4596
275k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4597
275k
                base64bits += 6;
4598
275k
                s++;
4599
275k
                if (base64bits >= 16) {
4600
                    /* we have enough bits for a UTF-16 value */
4601
97.7k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4602
97.7k
                    base64bits -= 16;
4603
97.7k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4604
97.7k
                    assert(outCh <= 0xffff);
4605
97.7k
                    if (surrogate) {
4606
                        /* expecting a second surrogate */
4607
8.10k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4608
2.85k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4609
2.85k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4610
0
                                goto onError;
4611
2.85k
                            surrogate = 0;
4612
2.85k
                            continue;
4613
2.85k
                        }
4614
5.25k
                        else {
4615
5.25k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4616
0
                                goto onError;
4617
5.25k
                            surrogate = 0;
4618
5.25k
                        }
4619
8.10k
                    }
4620
94.8k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4621
                        /* first surrogate */
4622
11.7k
                        surrogate = outCh;
4623
11.7k
                    }
4624
83.1k
                    else {
4625
83.1k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4626
0
                            goto onError;
4627
83.1k
                    }
4628
94.8k
                }
4629
275k
            }
4630
15.4k
            else { /* now leaving a base-64 section */
4631
15.4k
                inShift = 0;
4632
15.4k
                if (base64bits > 0) { /* left-over bits */
4633
12.5k
                    if (base64bits >= 6) {
4634
                        /* We've seen at least one base-64 character */
4635
5.77k
                        s++;
4636
5.77k
                        errmsg = "partial character in shift sequence";
4637
5.77k
                        goto utf7Error;
4638
5.77k
                    }
4639
6.79k
                    else {
4640
                        /* Some bits remain; they should be zero */
4641
6.79k
                        if (base64buffer != 0) {
4642
1.54k
                            s++;
4643
1.54k
                            errmsg = "non-zero padding bits in shift sequence";
4644
1.54k
                            goto utf7Error;
4645
1.54k
                        }
4646
6.79k
                    }
4647
12.5k
                }
4648
8.08k
                if (surrogate && DECODE_DIRECT(ch)) {
4649
2.62k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4650
0
                        goto onError;
4651
2.62k
                }
4652
8.08k
                surrogate = 0;
4653
8.08k
                if (ch == '-') {
4654
                    /* '-' is absorbed; other terminating
4655
                       characters are preserved */
4656
2.19k
                    s++;
4657
2.19k
                }
4658
8.08k
            }
4659
290k
        }
4660
5.35M
        else if ( ch == '+' ) {
4661
24.1k
            startinpos = s-starts;
4662
24.1k
            s++; /* consume '+' */
4663
24.1k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4664
2.82k
                s++;
4665
2.82k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4666
0
                    goto onError;
4667
2.82k
            }
4668
21.3k
            else if (s < e && !IS_BASE64(*s)) {
4669
2.76k
                s++;
4670
2.76k
                errmsg = "ill-formed sequence";
4671
2.76k
                goto utf7Error;
4672
2.76k
            }
4673
18.5k
            else { /* begin base64-encoded section */
4674
18.5k
                inShift = 1;
4675
18.5k
                surrogate = 0;
4676
18.5k
                shiftOutStart = writer.pos;
4677
18.5k
                base64bits = 0;
4678
18.5k
                base64buffer = 0;
4679
18.5k
            }
4680
24.1k
        }
4681
5.32M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4682
5.22M
            s++;
4683
5.22M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4684
0
                goto onError;
4685
5.22M
        }
4686
105k
        else {
4687
105k
            startinpos = s-starts;
4688
105k
            s++;
4689
105k
            errmsg = "unexpected special character";
4690
105k
            goto utf7Error;
4691
105k
        }
4692
5.52M
        continue;
4693
5.52M
utf7Error:
4694
115k
        endinpos = s-starts;
4695
115k
        if (unicode_decode_call_errorhandler_writer(
4696
115k
                errors, &errorHandler,
4697
115k
                "utf7", errmsg,
4698
115k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4699
115k
                &writer))
4700
9.16k
            goto onError;
4701
115k
    }
4702
4703
    /* end of string */
4704
4705
10.9k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4706
        /* if we're in an inconsistent state, that's an error */
4707
3.18k
        inShift = 0;
4708
3.18k
        if (surrogate ||
4709
2.80k
                (base64bits >= 6) ||
4710
1.93k
                (base64bits > 0 && base64buffer != 0)) {
4711
1.93k
            endinpos = size;
4712
1.93k
            if (unicode_decode_call_errorhandler_writer(
4713
1.93k
                    errors, &errorHandler,
4714
1.93k
                    "utf7", "unterminated shift sequence",
4715
1.93k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4716
1.93k
                    &writer))
4717
1.52k
                goto onError;
4718
401
            if (s < e)
4719
0
                goto restart;
4720
401
        }
4721
3.18k
    }
4722
4723
    /* return state */
4724
9.43k
    if (consumed) {
4725
0
        if (inShift) {
4726
0
            *consumed = startinpos;
4727
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4728
0
                PyObject *result = PyUnicode_FromKindAndData(
4729
0
                        writer.kind, writer.data, shiftOutStart);
4730
0
                Py_XDECREF(errorHandler);
4731
0
                Py_XDECREF(exc);
4732
0
                _PyUnicodeWriter_Dealloc(&writer);
4733
0
                return result;
4734
0
            }
4735
0
            writer.pos = shiftOutStart; /* back off output */
4736
0
        }
4737
0
        else {
4738
0
            *consumed = s-starts;
4739
0
        }
4740
0
    }
4741
4742
9.43k
    Py_XDECREF(errorHandler);
4743
9.43k
    Py_XDECREF(exc);
4744
9.43k
    return _PyUnicodeWriter_Finish(&writer);
4745
4746
10.6k
  onError:
4747
10.6k
    Py_XDECREF(errorHandler);
4748
10.6k
    Py_XDECREF(exc);
4749
10.6k
    _PyUnicodeWriter_Dealloc(&writer);
4750
10.6k
    return NULL;
4751
9.43k
}
4752
4753
4754
/* Encode the string `str` as UTF-7 and return the finished bytes writer's
   result.  An empty string yields the empty-bytes constant.

   Characters for which ENCODE_DIRECT() holds are copied through, '+'
   outside a shift sequence becomes "+-", and everything else is written
   as big-endian UTF-16 code units packed into base-64 inside a '+'...
   shift sequence.

   `errors` is accepted but not used by this function.  Returns NULL if
   the output buffer cannot be created. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* Reserve 8 output bytes per input character.  It might be possible
       to tighten this worst case. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int in_shift = 0;
    unsigned int nbits = 0;       /* number of pending bits in bitbuf */
    unsigned long bitbuf = 0;     /* bits waiting to be emitted */
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (in_shift) {
            if (ENCODE_DIRECT(ch)) {
                /* Shift out: flush the pending bits, left-aligned in a
                   final base-64 character. */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6 - nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                in_shift = 0;
                /* Characters not in the BASE64 set implicitly unshift the
                   sequence so no '-' is required, except if the character
                   is itself a '-' */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
            /* Otherwise stay in the shift sequence and encode ch below. */
        }
        else if (ch == '+') {
            *out++ = '+';
            *out++ = '-';
            continue;
        }
        else if (ENCODE_DIRECT(ch)) {
            *out++ = (char) ch;
            continue;
        }
        else {
            /* Open a shift sequence and encode ch below. */
            *out++ = '+';
            in_shift = 1;
        }

        /* Append ch to the base-64 stream as one UTF-16 code unit, or as
           a surrogate pair when it lies outside the BMP. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (nbits >= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits - 6));
                nbits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        while (nbits >= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits - 6));
            nbits -= 6;
        }
    }

    /* Flush whatever is still pending and close an open shift sequence. */
    if (nbits) {
        *out++ = TO_BASE64(bitbuf << (6 - nbits));
    }
    if (in_shift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4843
4844
#undef IS_BASE64
4845
#undef FROM_BASE64
4846
#undef TO_BASE64
4847
#undef DECODE_DIRECT
4848
#undef ENCODE_DIRECT
4849
4850
/* --- UTF-8 Codec -------------------------------------------------------- */
4851
4852
PyObject *
4853
PyUnicode_DecodeUTF8(const char *s,
4854
                     Py_ssize_t size,
4855
                     const char *errors)
4856
73.8M
{
4857
73.8M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4858
73.8M
}
4859
4860
#include "stringlib/asciilib.h"
4861
#include "stringlib/codecs.h"
4862
#include "stringlib/undef.h"
4863
4864
#include "stringlib/ucs1lib.h"
4865
#include "stringlib/codecs.h"
4866
#include "stringlib/undef.h"
4867
4868
#include "stringlib/ucs2lib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs4lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
/* Word-wide constants for the UTF-8 fast paths, one byte pattern repeated
   across a C 'size_t':
     ASCII_CHAR_MASK  0x80 in every byte; a non-zero AND with a loaded
                      word means the word holds a non-ASCII byte.
     VECTOR_0101      0x01 in every byte; used when counting codepoints
                      in a UTF-8 string.
     VECTOR_00FF      0xff in every other byte; also used when counting
                      codepoints. */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4890
4891
/* ctz(v): index of the lowest set bit of v (count of trailing zeros).
   HAVE_CTZ is 1 when a compiler intrinsic provides it and 0 otherwise,
   in which case ctz() is not defined at all.  Callers are expected to
   pass a non-zero v: the GCC/Clang builtin documents its result as
   undefined for 0. */
#if (defined(__GNUC__) || defined(__clang__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return (unsigned int)__builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long pos;
    /* Pick the bit-scan intrinsic that matches the width of size_t. */
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&pos, v);
#else
    _BitScanForward64(&pos, v);
#endif /* SIZEOF_SIZE_T */
    return (unsigned int)pos;
}
#else
#define HAVE_CTZ 0
#endif
4914
4915
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Build a size_t whose first `size` bytes are p[0]..p[size-1] and whose
// remaining bytes are zero.  The bytes are copied one at a time, so there
// is no unaligned word access and nothing beyond p[size-1] is read.
// A `size` that matches no case label loads SIZEOF_SIZE_T bytes.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t word;
        unsigned char bytes[SIZEOF_SIZE_T];
    } view;
    view.word = 0;
    // Filling the word through a byte view assumes little endian:
    // * a union is faster than bitwise or and shift.
    // * big endian machines are rare and hard to maintain.
    // Each case copies its own byte and then falls through to copy all
    // the lower ones.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        view.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        view.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        view.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        view.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        view.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        view.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        view.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        view.bytes[0] = p[0];
        break;
    case 0:
        // Nothing to load: the word stays zero.
        break;
    }
    return view.word;
}
#endif
4962
4963
/*
4964
 * Find the first non-ASCII character in a byte sequence.
4965
 *
4966
 * This function scans a range of bytes from `start` to `end` and returns the
4967
 * index of the first byte that is not an ASCII character (i.e., has the most
4968
 * significant bit set). If all characters in the range are ASCII, it returns
4969
 * `end - start`.
4970
 */
4971
static Py_ssize_t
4972
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4973
74.8M
{
4974
    // The search is done in `size_t` chunks.
4975
    // The start and end might not be aligned at `size_t` boundaries,
4976
    // so they're handled specially.
4977
4978
74.8M
    const unsigned char *p = start;
4979
4980
74.8M
    if (end - start >= SIZEOF_SIZE_T) {
4981
        // Avoid unaligned read.
4982
25.2M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4983
25.2M
        size_t u;
4984
25.2M
        memcpy(&u, p, sizeof(size_t));
4985
25.2M
        u &= ASCII_CHAR_MASK;
4986
25.2M
        if (u) {
4987
6.03M
            return (ctz(u) - 7) / 8;
4988
6.03M
        }
4989
19.2M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4990
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4991
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
4992
        while (p < p2) {
4993
            if (*p & 0x80) {
4994
                return p - start;
4995
            }
4996
            p++;
4997
        }
4998
#endif
4999
5000
19.2M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5001
110M
        while (p <= e) {
5002
93.2M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5003
93.2M
            if (u) {
5004
1.84M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5005
1.84M
                return p - start + (ctz(u) - 7) / 8;
5006
#else
5007
                // big endian and minor compilers are difficult to test.
5008
                // fallback to per byte check.
5009
                break;
5010
#endif
5011
1.84M
            }
5012
91.4M
            p += SIZEOF_SIZE_T;
5013
91.4M
        }
5014
19.2M
    }
5015
66.9M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5016
74.8M
    assert((end - p) < SIZEOF_SIZE_T);
5017
    // we can not use *(const size_t*)p to avoid buffer overrun.
5018
66.9M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5019
66.9M
    if (u) {
5020
9.19M
        return p - start + (ctz(u) - 7) / 8;
5021
9.19M
    }
5022
57.7M
    return end - start;
5023
#else
5024
    while (p < end) {
5025
        if (*p & 0x80) {
5026
            break;
5027
        }
5028
        p++;
5029
    }
5030
    return p - start;
5031
#endif
5032
66.9M
}
5033
5034
// Return 1 when the byte `ch` can begin a UTF-8 sequence, i.e. it looks
// like 0xxxxxxx or 11xxxxxx, and 0 for a 10xxxxxx continuation byte.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    unsigned int top_bit_clear = ~ch >> 7;  // bit 0 set for 0xxxxxxx
    unsigned int bit6 = ch >> 6;            // bit 0 set for x1xxxxxx
    return (int)((top_bit_clear | bit6) & 1u);
}
5040
5041
static inline size_t
5042
vector_utf8_start_chars(size_t v)
5043
48.7M
{
5044
48.7M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5045
48.7M
}
5046
5047
5048
// Count the number of UTF-8 code points in a given byte sequence.
5049
static Py_ssize_t
5050
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5051
367k
{
5052
367k
    Py_ssize_t len = 0;
5053
5054
367k
    if (end - s >= SIZEOF_SIZE_T) {
5055
300k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5056
19.9k
            len += scalar_utf8_start_char(*s++);
5057
19.9k
        }
5058
5059
743k
        while (s + SIZEOF_SIZE_T <= end) {
5060
462k
            const unsigned char *e = end;
5061
462k
            if (e - s > SIZEOF_SIZE_T * 255) {
5062
183k
                e = s + SIZEOF_SIZE_T * 255;
5063
183k
            }
5064
462k
            Py_ssize_t vstart = 0;
5065
49.2M
            while (s + SIZEOF_SIZE_T <= e) {
5066
48.7M
                size_t v = *(size_t*)s;
5067
48.7M
                size_t vs = vector_utf8_start_chars(v);
5068
48.7M
                vstart += vs;
5069
48.7M
                s += SIZEOF_SIZE_T;
5070
48.7M
            }
5071
462k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5072
462k
            vstart += vstart >> 16;
5073
462k
#if SIZEOF_SIZE_T == 8
5074
462k
            vstart += vstart >> 32;
5075
462k
#endif
5076
462k
            len += vstart & 0x7ff;
5077
462k
        }
5078
280k
    }
5079
1.25M
    while (s < end) {
5080
892k
        len += scalar_utf8_start_char(*s++);
5081
892k
    }
5082
367k
    return len;
5083
367k
}
5084
5085
static Py_ssize_t
5086
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5087
16.8M
{
5088
16.8M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5089
16.8M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5090
16.7M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5091
13.3M
    {
5092
        /* Fast path, see in STRINGLIB(utf8_decode) for
5093
           an explanation. */
5094
13.3M
        const char *p = start;
5095
13.3M
        Py_UCS1 *q = dest;
5096
18.7M
        while (p + SIZEOF_SIZE_T <= end) {
5097
7.81M
            size_t value = *(const size_t *) p;
5098
7.81M
            if (value & ASCII_CHAR_MASK)
5099
2.48M
                break;
5100
5.33M
            *((size_t *)q) = value;
5101
5.33M
            p += SIZEOF_SIZE_T;
5102
5.33M
            q += SIZEOF_SIZE_T;
5103
5.33M
        }
5104
64.9M
        while (p < end) {
5105
54.0M
            if ((unsigned char)*p & 0x80)
5106
2.50M
                break;
5107
51.5M
            *q++ = *p++;
5108
51.5M
        }
5109
13.3M
        return p - start;
5110
13.3M
    }
5111
3.45M
#endif
5112
3.45M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5113
3.45M
                                         (const unsigned char*)end);
5114
3.45M
    memcpy(dest, start, pos);
5115
3.45M
    return pos;
5116
16.8M
}
5117
5118
static int
5119
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5120
                         const char *starts, const char *s, const char *end,
5121
                         _Py_error_handler error_handler,
5122
                         const char *errors,
5123
                         Py_ssize_t *consumed)
5124
17.0M
{
5125
17.0M
    Py_ssize_t startinpos, endinpos;
5126
17.0M
    const char *errmsg = "";
5127
17.0M
    PyObject *error_handler_obj = NULL;
5128
17.0M
    PyObject *exc = NULL;
5129
5130
340M
    while (s < end) {
5131
333M
        Py_UCS4 ch;
5132
333M
        int kind = writer->kind;
5133
5134
333M
        if (kind == PyUnicode_1BYTE_KIND) {
5135
17.5M
            if (PyUnicode_IS_ASCII(writer->buffer))
5136
16.7M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5137
800k
            else
5138
800k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5139
315M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5140
114M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5141
201M
        } else {
5142
201M
            assert(kind == PyUnicode_4BYTE_KIND);
5143
201M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5144
201M
        }
5145
5146
333M
        switch (ch) {
5147
9.78M
        case 0:
5148
9.78M
            if (s == end || consumed)
5149
9.75M
                goto End;
5150
25.8k
            errmsg = "unexpected end of data";
5151
25.8k
            startinpos = s - starts;
5152
25.8k
            endinpos = end - starts;
5153
25.8k
            break;
5154
236M
        case 1:
5155
236M
            errmsg = "invalid start byte";
5156
236M
            startinpos = s - starts;
5157
236M
            endinpos = startinpos + 1;
5158
236M
            break;
5159
68.6M
        case 2:
5160
68.6M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5161
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5162
0
            {
5163
                /* Truncated surrogate code in range D800-DFFF */
5164
0
                goto End;
5165
0
            }
5166
68.6M
            _Py_FALLTHROUGH;
5167
69.9M
        case 3:
5168
70.2M
        case 4:
5169
70.2M
            errmsg = "invalid continuation byte";
5170
70.2M
            startinpos = s - starts;
5171
70.2M
            endinpos = startinpos + ch - 1;
5172
70.2M
            break;
5173
16.7M
        default:
5174
            // ch doesn't fit into kind, so change the buffer kind to write
5175
            // the character
5176
16.7M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5177
0
                goto onError;
5178
16.7M
            continue;
5179
333M
        }
5180
5181
306M
        if (error_handler == _Py_ERROR_UNKNOWN)
5182
166k
            error_handler = _Py_GetErrorHandler(errors);
5183
5184
306M
        switch (error_handler) {
5185
0
        case _Py_ERROR_IGNORE:
5186
0
            s += (endinpos - startinpos);
5187
0
            break;
5188
5189
303M
        case _Py_ERROR_REPLACE:
5190
303M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5191
0
                goto onError;
5192
303M
            s += (endinpos - startinpos);
5193
303M
            break;
5194
5195
3.11M
        case _Py_ERROR_SURROGATEESCAPE:
5196
3.11M
        {
5197
3.11M
            Py_ssize_t i;
5198
5199
3.11M
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5200
0
                goto onError;
5201
6.23M
            for (i=startinpos; i<endinpos; i++) {
5202
3.11M
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5203
3.11M
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5204
3.11M
                                ch + 0xdc00);
5205
3.11M
                writer->pos++;
5206
3.11M
            }
5207
3.11M
            s += (endinpos - startinpos);
5208
3.11M
            break;
5209
3.11M
        }
5210
5211
1.58k
        default:
5212
1.58k
            if (unicode_decode_call_errorhandler_writer(
5213
1.58k
                    errors, &error_handler_obj,
5214
1.58k
                    "utf-8", errmsg,
5215
1.58k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5216
1.58k
                    writer)) {
5217
1.58k
                goto onError;
5218
1.58k
            }
5219
5220
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5221
0
                goto onError;
5222
0
            }
5223
306M
        }
5224
306M
    }
5225
5226
17.0M
End:
5227
17.0M
    if (consumed)
5228
656
        *consumed = s - starts;
5229
5230
17.0M
    Py_XDECREF(error_handler_obj);
5231
17.0M
    Py_XDECREF(exc);
5232
17.0M
    return 0;
5233
5234
1.58k
onError:
5235
1.58k
    Py_XDECREF(error_handler_obj);
5236
1.58k
    Py_XDECREF(exc);
5237
1.58k
    return -1;
5238
17.0M
}
5239
5240
5241
/* Decode the `size` bytes of UTF-8 starting at `s` into a str object.

   Shortcuts are taken for empty input, for a single ASCII byte, and for
   input that find_first_nonascii() reports as entirely ASCII.  Everything
   else is handed to unicode_decode_utf8_impl() through a _PyUnicodeWriter
   that wraps a pre-allocated buffer.

   If `consumed` is non-NULL, the shortcut paths store the number of input
   bytes used in *consumed; on the writer path `consumed` is forwarded to
   unicode_decode_utf8_impl().  Returns NULL on failure. */
static PyObject *
5242
unicode_decode_utf8(const char *s, Py_ssize_t size,
5243
                    _Py_error_handler error_handler, const char *errors,
5244
                    Py_ssize_t *consumed)
5245
108M
{
5246
108M
    if (size == 0) {
5247
3.63M
        if (consumed) {
5248
0
            *consumed = 0;
5249
0
        }
5250
3.63M
        _Py_RETURN_UNICODE_EMPTY();
5251
3.63M
    }
5252
5253
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5254
104M
    if (size == 1 && (unsigned char)s[0] < 128) {
5255
33.2M
        if (consumed) {
5256
0
            *consumed = 1;
5257
0
        }
5258
33.2M
        return get_latin1_char((unsigned char)s[0]);
5259
33.2M
    }
5260
5261
    // It is unclear whether this check is strictly necessary, but there is a
5262
    // test case that requires size=PY_SSIZE_T_MAX to raise MemoryError.
5263
71.3M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5264
0
        PyErr_NoMemory();
5265
0
        return NULL;
5266
0
    }
5267
5268
71.3M
    const char *starts = s;
5269
71.3M
    const char *end = s + size;
5270
5271
71.3M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5272
71.3M
    if (pos == size) {  // fast path: ASCII string.
5273
54.3M
        PyObject *u = PyUnicode_New(size, 127);
5274
54.3M
        if (u == NULL) {
5275
0
            return NULL;
5276
0
        }
5277
54.3M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5278
54.3M
        if (consumed) {
5279
102
            *consumed = size;
5280
102
        }
5281
54.3M
        return u;
5282
54.3M
    }
5283
5284
17.0M
    int maxchr = 127;
5285
17.0M
    Py_ssize_t maxsize = size;
5286
5287
17.0M
    unsigned char ch = (unsigned char)(s[pos]);
5288
    // An error handler other than strict may remove/replace the invalid byte.
5289
    // consumed != NULL allows 1~3 trailing bytes to remain undecoded.
5290
    // 0x80 <= ch < 0xc2 is an invalid start byte that causes UnicodeDecodeError.
5291
    // Otherwise: check the input and decide the maxchr and maxsize to reduce
5292
    // reallocation and copy.
5293
17.0M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5294
        // We only calculate the number of codepoints and don't determine the exact maxchr.
5295
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5296
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5297
        // means that it is no longer necessary to allocate several times the required amount
5298
        // of memory.
5299
367k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
        // Guess the buffer width from the first non-ASCII lead byte only.
5300
367k
        if (ch < 0xc4) { // latin1
5301
237k
            maxchr = 0xff;
5302
237k
        }
5303
130k
        else if (ch < 0xf0) { // ucs2
5304
116k
            maxchr = 0xffff;
5305
116k
        }
5306
13.2k
        else { // ucs4
5307
13.2k
            maxchr = 0x10ffff;
5308
13.2k
        }
5309
367k
    }
5310
17.0M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5311
17.0M
    if (!u) {
5312
0
        return NULL;
5313
0
    }
5314
5315
    // Fall back to _PyUnicodeWriter once the fast paths do not apply.
5316
17.0M
    _PyUnicodeWriter writer;
5317
17.0M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
    // The ASCII prefix is copied byte-for-byte only when the buffer is
    // 1-byte wide; for wider buffers `s` is left at the start of the input.
5318
17.0M
    if (maxchr <= 255) {
5319
16.9M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5320
16.9M
        s += pos;
5321
16.9M
        writer.pos = pos;
5322
16.9M
    }
5323
5324
17.0M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5325
17.0M
                                 error_handler, errors,
5326
17.0M
                                 consumed) < 0) {
5327
1.58k
        _PyUnicodeWriter_Dealloc(&writer);
5328
1.58k
        return NULL;
5329
1.58k
    }
5330
17.0M
    return _PyUnicodeWriter_Finish(&writer);
5331
17.0M
}
5332
5333
5334
// Used by PyUnicodeWriter_WriteUTF8() implementation
//
// Decode the `size` bytes at `s` and append the result to `writer`.
// Returns 0 for empty input and on the ASCII fast path, -1 if the writer
// cannot be prepared, and otherwise whatever unicode_decode_utf8_impl()
// returns.  When `consumed` is non-NULL the early-return paths store the
// number of bytes used in *consumed.
5335
int
5336
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5337
                            const char *s, Py_ssize_t size,
5338
                            _Py_error_handler error_handler, const char *errors,
5339
                            Py_ssize_t *consumed)
5340
3.47M
{
5341
3.47M
    if (size == 0) {
5342
6.27k
        if (consumed) {
5343
0
            *consumed = 0;
5344
0
        }
5345
6.27k
        return 0;
5346
6.27k
    }
5347
5348
    // fast path: try ASCII string.
5349
3.46M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5350
0
        return -1;
5351
0
    }
5352
5353
3.46M
    const char *starts = s;
5354
3.46M
    const char *end = s + size;
5355
3.46M
    Py_ssize_t decoded = 0;
    // Address of the writer's current position; only used below when the
    // buffer is 1-byte wide.
5356
3.46M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5357
3.46M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5358
3.46M
        decoded = ascii_decode(s, end, dest);
5359
3.46M
        writer->pos += decoded;
5360
5361
3.46M
        if (decoded == size) {
5362
3.42M
            if (consumed) {
5363
1.14k
                *consumed = size;
5364
1.14k
            }
5365
3.42M
            return 0;
5366
3.42M
        }
        // Skip the prefix that was already written to the buffer.
5367
38.7k
        s += decoded;
5368
38.7k
    }
5369
5370
    // General decoder handles the rest (or everything, for wider buffers).
40.6k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5371
40.6k
                                    error_handler, errors, consumed);
5372
3.46M
}
5373
5374
5375
/* Decode UTF-8 by delegating to unicode_decode_utf8().  A NULL `errors`
   selects _Py_ERROR_STRICT directly; otherwise _Py_ERROR_UNKNOWN is passed
   along with the `errors` string.  `consumed` is forwarded unchanged. */
PyObject *
5376
PyUnicode_DecodeUTF8Stateful(const char *s,
5377
                             Py_ssize_t size,
5378
                             const char *errors,
5379
                             Py_ssize_t *consumed)
5380
108M
{
5381
108M
    return unicode_decode_utf8(s, size,
5382
108M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5383
108M
                               errors, consumed);
5384
108M
}
5385
5386
5387
/* UTF-8 decoder: 'errors' selects the error handler and must be one of
5388
   _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS; any other value returns -3.
5389
5390
   On success, write a pointer to a newly allocated wide character string into
5391
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5392
   (in number of wchar_t units) into *wlen (if wlen is set).
5393
5394
   On memory allocation failure, return -1.
5395
5396
   On decoding error (never with _Py_ERROR_SURROGATEESCAPE), return -2. If wlen is
5397
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5398
   is not NULL, write the decoding error message into *reason. */
5399
int
5400
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5401
                 const char **reason, _Py_error_handler errors)
5402
288
{
5403
288
    const char *orig_s = s;
5404
288
    const char *e;
5405
288
    wchar_t *unicode;
5406
288
    Py_ssize_t outpos;
5407
5408
    /* Translate the handler enum into two flags; both stay 0 for strict. */
288
    int surrogateescape = 0;
5409
288
    int surrogatepass = 0;
5410
288
    switch (errors)
5411
288
    {
5412
0
    case _Py_ERROR_STRICT:
5413
0
        break;
5414
288
    case _Py_ERROR_SURROGATEESCAPE:
5415
288
        surrogateescape = 1;
5416
288
        break;
5417
0
    case _Py_ERROR_SURROGATEPASS:
5418
0
        surrogatepass = 1;
5419
0
        break;
5420
0
    default:
        /* error handler not supported by this decoder */
5421
0
        return -3;
5422
288
    }
5423
5424
    /* Note: size will always be longer than the resulting Unicode
5425
       character count */
5426
288
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5427
0
        return -1;
5428
0
    }
5429
5430
    /* One extra wchar_t for the terminating L'\0' written at the end. */
288
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5431
288
    if (!unicode) {
5432
0
        return -1;
5433
0
    }
5434
5435
    /* Unpack UTF-8 encoded data */
5436
288
    e = s + size;
5437
288
    outpos = 0;
5438
288
    while (s < e) {
5439
288
        Py_UCS4 ch;
5440
288
#if SIZEOF_WCHAR_T == 4
5441
288
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5442
#else
5443
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5444
#endif
        /* Values above 0xFF are code points the helper did not store itself;
           values up to 0xFF are treated below as status codes (0 at end of
           input means everything was decoded). */
5445
288
        if (ch > 0xFF) {
5446
0
#if SIZEOF_WCHAR_T == 4
5447
0
            Py_UNREACHABLE();
5448
#else
5449
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5450
            /* write a surrogate pair */
5451
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5452
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5453
#endif
5454
0
        }
5455
288
        else {
5456
288
            if (!ch && s == e) {
5457
288
                break;
5458
288
            }
5459
5460
0
            if (surrogateescape) {
                /* Escape the undecodable byte as 0xDC00 + byte and move on. */
5461
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5462
0
            }
5463
0
            else {
5464
                /* Is it a valid three-byte code? */
5465
0
                if (surrogatepass
5466
0
                    && (e - s) >= 3
5467
0
                    && (s[0] & 0xf0) == 0xe0
5468
0
                    && (s[1] & 0xc0) == 0x80
5469
0
                    && (s[2] & 0xc0) == 0x80)
5470
0
                {
5471
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5472
0
                    s += 3;
5473
0
                    unicode[outpos++] = ch;
5474
0
                }
5475
0
                else {
                    /* Unrecoverable: release the buffer and report -2. */
5476
0
                    PyMem_RawFree(unicode );
5477
0
                    if (reason != NULL) {
5478
0
                        switch (ch) {
5479
0
                        case 0:
5480
0
                            *reason = "unexpected end of data";
5481
0
                            break;
5482
0
                        case 1:
5483
0
                            *reason = "invalid start byte";
5484
0
                            break;
5485
                        /* 2, 3, 4 */
5486
0
                        default:
5487
0
                            *reason = "invalid continuation byte";
5488
0
                            break;
5489
0
                        }
5490
0
                    }
5491
0
                    if (wlen != NULL) {
                        /* byte offset of the offending sequence */
5492
0
                        *wlen = s - orig_s;
5493
0
                    }
5494
0
                    return -2;
5495
0
                }
5496
0
            }
5497
0
        }
5498
288
    }
5499
288
    unicode[outpos] = L'\0';
5500
288
    if (wlen) {
5501
288
        *wlen = outpos;
5502
288
    }
5503
288
    *wstr = unicode;
5504
288
    return 0;
5505
288
}
5506
5507
5508
/* Decode UTF-8 with the _Py_ERROR_SURROGATEESCAPE handler by calling
   _Py_DecodeUTF8Ex().  Returns the decoded wide string on success.  On
   failure returns NULL and, if wlen is non-NULL, stores the (negative)
   result code converted to size_t in *wlen. */
wchar_t*
5509
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5510
                               size_t *wlen)
5511
0
{
5512
0
    wchar_t *wstr;
5513
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5514
0
                               &wstr, wlen,
5515
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5516
0
    if (res != 0) {
5517
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5518
0
        assert(res != -3);
5519
0
        if (wlen) {
5520
0
            *wlen = (size_t)res;
5521
0
        }
5522
0
        return NULL;
5523
0
    }
5524
0
    return wstr;
5525
0
}
5526
5527
5528
/* UTF-8 encoder.
5529
5530
   On success, return 0 and write the newly allocated character string (use
5531
   PyMem_RawFree() if raw_malloc is non-zero, else PyMem_Free(), to free it) into *str.
5532
5533
   On encoding failure, return -2 and write the position of the invalid
5534
   surrogate character into *error_pos (if error_pos is set) and the encoding
5535
   error message into *reason (if reason is set).
5536
5537
   On memory allocation failure, return -1. */
5538
int
5539
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5540
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5541
612
{
    /* Worst case: every input unit becomes a 4-byte sequence. */
5542
612
    const Py_ssize_t max_char_size = 4;
5543
612
    Py_ssize_t len = wcslen(text);
5544
5545
612
    assert(len >= 0);
5546
5547
612
    int surrogateescape = 0;
5548
612
    int surrogatepass = 0;
5549
612
    switch (errors)
5550
612
    {
5551
144
    case _Py_ERROR_STRICT:
5552
144
        break;
5553
468
    case _Py_ERROR_SURROGATEESCAPE:
5554
468
        surrogateescape = 1;
5555
468
        break;
5556
0
    case _Py_ERROR_SURROGATEPASS:
5557
0
        surrogatepass = 1;
5558
0
        break;
5559
0
    default:
        /* error handler not supported by this encoder */
5560
0
        return -3;
5561
612
    }
5562
5563
    /* Guard the (len + 1) * max_char_size computation below. */
612
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5564
0
        return -1;
5565
0
    }
5566
612
    char *bytes;
5567
612
    if (raw_malloc) {
5568
612
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5569
612
    }
5570
0
    else {
5571
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5572
0
    }
5573
612
    if (bytes == NULL) {
5574
0
        return -1;
5575
0
    }
5576
5577
612
    char *p = bytes;
5578
612
    Py_ssize_t i;
5579
19.1k
    for (i = 0; i < len; ) {
5580
18.5k
        Py_ssize_t ch_pos = i;
5581
18.5k
        Py_UCS4 ch = text[i];
5582
18.5k
        i++;
        /* With 2-byte wchar_t, combine a high/low surrogate pair into one
           code point and consume both units. */
5583
18.5k
        if (sizeof(wchar_t) == 2
5584
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5585
0
            && i < len
5586
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5587
0
        {
5588
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5589
0
            i++;
5590
0
        }
5591
5592
18.5k
        if (ch < 0x80) {
5593
            /* Encode ASCII */
5594
18.5k
            *p++ = (char) ch;
5595
5596
18.5k
        }
5597
0
        else if (ch < 0x0800) {
5598
            /* Two-byte sequence: U+0080..U+07FF */
5599
0
            *p++ = (char)(0xc0 | (ch >> 6));
5600
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5601
0
        }
5602
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5603
            /* surrogateescape error handler: only 0xDC80..0xDCFF can be
               turned back into a single byte; anything else is an error */
5604
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5605
0
                if (error_pos != NULL) {
5606
0
                    *error_pos = (size_t)ch_pos;
5607
0
                }
5608
0
                if (reason != NULL) {
5609
0
                    *reason = "encoding error";
5610
0
                }
5611
0
                if (raw_malloc) {
5612
0
                    PyMem_RawFree(bytes);
5613
0
                }
5614
0
                else {
5615
0
                    PyMem_Free(bytes);
5616
0
                }
5617
0
                return -2;
5618
0
            }
5619
0
            *p++ = (char)(ch & 0xff);
5620
0
        }
5621
0
        else if (ch < 0x10000) {
            /* Three-byte sequence */
5622
0
            *p++ = (char)(0xe0 | (ch >> 12));
5623
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5624
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5625
0
        }
5626
0
        else {  /* ch >= 0x10000 */
5627
0
            assert(ch <= MAX_UNICODE);
5628
            /* Encode UCS4 Unicode ordinals */
5629
0
            *p++ = (char)(0xf0 | (ch >> 18));
5630
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
18.5k
    }
5635
612
    *p++ = '\0';
5636
5637
    /* Shrink the worst-case buffer to the bytes actually written
       (including the terminating NUL). */
612
    size_t final_size = (p - bytes);
5638
612
    char *bytes2;
5639
612
    if (raw_malloc) {
5640
612
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5641
612
    }
5642
0
    else {
5643
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5644
0
    }
5645
612
    if (bytes2 == NULL) {
5646
0
        if (error_pos != NULL) {
5647
0
            *error_pos = (size_t)-1;
5648
0
        }
        /* The original buffer is still owned here and must be released. */
5649
0
        if (raw_malloc) {
5650
0
            PyMem_RawFree(bytes);
5651
0
        }
5652
0
        else {
5653
0
            PyMem_Free(bytes);
5654
0
        }
5655
0
        return -1;
5656
0
    }
5657
612
    *str = bytes2;
5658
612
    return 0;
5659
612
}
5660
5661
5662
/* Primary internal function which creates utf8 encoded bytes objects.
5663
5664
   Allocation strategy:  if the string is short, convert into a stack buffer
5665
   and allocate exactly as much space needed at the end.  Else allocate the
5666
   maximum possible needed (4 result bytes per Unicode character), and return
5667
   the excess memory at the end.
5668
*/
5669
static PyObject *
5670
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5671
                    const char *errors)
5672
19.2M
{
5673
19.2M
    if (!PyUnicode_Check(unicode)) {
5674
0
        PyErr_BadArgument();
5675
0
        return NULL;
5676
0
    }
5677
5678
    /* If a UTF-8 representation is already stored on the object, copy it. */
19.2M
    if (PyUnicode_UTF8(unicode))
5679
9.82M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5680
9.82M
                                         PyUnicode_UTF8_LENGTH(unicode));
5681
5682
9.42M
    int kind = PyUnicode_KIND(unicode);
5683
9.42M
    const void *data = PyUnicode_DATA(unicode);
5684
9.42M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5685
5686
9.42M
    PyBytesWriter *writer;
5687
9.42M
    char *end;
5688
5689
    /* Dispatch to the encoder specialised for the string's storage width;
       each one returns the writer and sets `end`. */
9.42M
    switch (kind) {
5690
0
    default:
5691
0
        Py_UNREACHABLE();
5692
6.30M
    case PyUnicode_1BYTE_KIND:
5693
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5694
6.30M
        assert(!PyUnicode_IS_ASCII(unicode));
5695
6.30M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5696
6.30M
                                      error_handler, errors, &end);
5697
6.30M
        break;
5698
1.92M
    case PyUnicode_2BYTE_KIND:
5699
1.92M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5700
1.92M
                                      error_handler, errors, &end);
5701
1.92M
        break;
5702
1.19M
    case PyUnicode_4BYTE_KIND:
5703
1.19M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5704
1.19M
                                      error_handler, errors, &end);
5705
1.19M
        break;
5706
9.42M
    }
5707
5708
9.42M
    if (writer == NULL) {
        /* NOTE(review): writer is NULL on this path, so there is nothing to
           discard; presumably PyBytesWriter_Discard(NULL) is a no-op and the
           call is redundant -- confirm before removing. */
5709
138k
        PyBytesWriter_Discard(writer);
5710
138k
        return NULL;
5711
138k
    }
5712
9.29M
    return PyBytesWriter_FinishWithPointer(writer, end);
5713
9.42M
}
5714
5715
/* Encode `unicode` to UTF-8 with the strict error handler and store a
   NUL-terminated copy on the object via PyUnicode_SET_UTF8() and
   PyUnicode_SET_UTF8_LENGTH().  Returns 0 on success and -1 on failure.
   Asserts that the object is locked and that it is not ASCII. */
static int
5716
unicode_fill_utf8(PyObject *unicode)
5717
161k
{
5718
161k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5719
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5720
161k
    assert(!PyUnicode_IS_ASCII(unicode));
5721
5722
161k
    int kind = PyUnicode_KIND(unicode);
5723
161k
    const void *data = PyUnicode_DATA(unicode);
5724
161k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5725
5726
161k
    PyBytesWriter *writer;
5727
161k
    char *end;
5728
5729
161k
    switch (kind) {
5730
0
    default:
5731
0
        Py_UNREACHABLE();
5732
121k
    case PyUnicode_1BYTE_KIND:
5733
121k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5734
121k
                                      _Py_ERROR_STRICT, NULL, &end);
5735
121k
        break;
5736
33.4k
    case PyUnicode_2BYTE_KIND:
5737
33.4k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5738
33.4k
                                      _Py_ERROR_STRICT, NULL, &end);
5739
33.4k
        break;
5740
6.54k
    case PyUnicode_4BYTE_KIND:
5741
6.54k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5742
6.54k
                                      _Py_ERROR_STRICT, NULL, &end);
5743
6.54k
        break;
5744
161k
    }
5745
161k
    if (writer == NULL) {
5746
206
        return -1;
5747
206
    }
5748
5749
    /* Encoded length is the distance from the writer's data to `end`. */
160k
    const char *start = PyBytesWriter_GetData(writer);
5750
160k
    Py_ssize_t len = end - start;
5751
5752
    /* +1 for the terminating NUL stored after the encoded bytes. */
160k
    char *cache = PyMem_Malloc(len + 1);
5753
160k
    if (cache == NULL) {
5754
0
        PyBytesWriter_Discard(writer);
5755
0
        PyErr_NoMemory();
5756
0
        return -1;
5757
0
    }
5758
160k
    memcpy(cache, start, len);
5759
160k
    cache[len] = '\0';
5760
160k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5761
160k
    PyUnicode_SET_UTF8(unicode, cache);
    /* The bytes were copied into `cache`, so the writer is no longer needed. */
5762
160k
    PyBytesWriter_Discard(writer);
5763
160k
    return 0;
5764
160k
}
5765
5766
/* Encode `unicode` to UTF-8 via unicode_encode_utf8(), passing
   _Py_ERROR_UNKNOWN together with the `errors` string. */
PyObject *
5767
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5768
17.9M
{
5769
17.9M
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5770
17.9M
}
5771
5772
5773
/* Encode `unicode` to UTF-8; equivalent to
   _PyUnicode_AsUTF8String(unicode, NULL). */
PyObject *
5774
PyUnicode_AsUTF8String(PyObject *unicode)
5775
2.48k
{
5776
2.48k
    return _PyUnicode_AsUTF8String(unicode, NULL);
5777
2.48k
}
5778
5779
/* --- UTF-32 Codec ------------------------------------------------------- */
5780
5781
/* Decode UTF-32; equivalent to PyUnicode_DecodeUTF32Stateful() with a NULL
   `consumed` argument. */
PyObject *
5782
PyUnicode_DecodeUTF32(const char *s,
5783
                      Py_ssize_t size,
5784
                      const char *errors,
5785
                      int *byteorder)
5786
155
{
5787
155
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5788
155
}
5789
5790
/* Decode `size` bytes of UTF-32 at `s` into a str object.

   *byteorder (if byteorder is non-NULL) gives the initial byte order: 0
   lets a leading BOM decide and is then overwritten with the detected order
   (-1 or 1, or left 0 when no BOM is found).  Decoding problems are passed
   to unicode_decode_call_errorhandler_writer() with the `errors` string.

   If `consumed` is non-NULL, trailing bytes that do not form a complete
   4-byte unit are not treated as an error and *consumed receives the number
   of bytes used.  Returns NULL on failure. */
PyObject *
5791
PyUnicode_DecodeUTF32Stateful(const char *s,
5792
                              Py_ssize_t size,
5793
                              const char *errors,
5794
                              int *byteorder,
5795
                              Py_ssize_t *consumed)
5796
33.5k
{
5797
33.5k
    const char *starts = s;
5798
33.5k
    Py_ssize_t startinpos;
5799
33.5k
    Py_ssize_t endinpos;
5800
33.5k
    _PyUnicodeWriter writer;
5801
33.5k
    const unsigned char *q, *e;
5802
33.5k
    int le, bo = 0;       /* assume native ordering by default */
5803
33.5k
    const char *encoding;
5804
33.5k
    const char *errmsg = "";
5805
33.5k
    PyObject *errorHandler = NULL;
5806
33.5k
    PyObject *exc = NULL;
5807
5808
33.5k
    q = (const unsigned char *)s;
5809
33.5k
    e = q + size;
5810
5811
33.5k
    if (byteorder)
5812
33.3k
        bo = *byteorder;
5813
5814
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5815
       byte order setting accordingly. In native mode, the leading BOM
5816
       mark is skipped, in all other modes, it is copied to the output
5817
       stream as-is (giving a ZWNBSP character). */
5818
33.5k
    if (bo == 0 && size >= 4) {
        /* The first four bytes are read as a little-endian value, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000. */
5819
31.2k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5820
31.2k
        if (bom == 0x0000FEFF) {
5821
195
            bo = -1;
5822
195
            q += 4;
5823
195
        }
5824
31.0k
        else if (bom == 0xFFFE0000) {
5825
476
            bo = 1;
5826
476
            q += 4;
5827
476
        }
5828
31.2k
        if (byteorder)
5829
31.1k
            *byteorder = bo;
5830
31.2k
    }
5831
5832
33.5k
    if (q == e) {
5833
101
        if (consumed)
5834
0
            *consumed = size;
5835
101
        _Py_RETURN_UNICODE_EMPTY();
5836
101
    }
5837
5838
    /* bo == 0 (no order given, no BOM) falls back to the host byte order. */
#ifdef WORDS_BIGENDIAN
5839
    le = bo < 0;
5840
#else
5841
33.4k
    le = bo <= 0;
5842
33.4k
#endif
5843
33.4k
    encoding = le ? "utf-32-le" : "utf-32-be";
5844
5845
33.4k
    _PyUnicodeWriter_Init(&writer);
    /* One output character per 4 input bytes, rounded up. */
5846
33.4k
    writer.min_length = (e - q + 3) / 4;
5847
33.4k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5848
0
        goto onError;
5849
5850
112k
    while (1) {
5851
112k
        Py_UCS4 ch = 0;
5852
112k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5853
5854
112k
        if (e - q >= 4) {
            /* Fast loop: store code points directly while they fit the
               current buffer (ch <= maxch) and are not surrogates; any other
               value leaves the loop with q still pointing at it. */
5855
87.7k
            int kind = writer.kind;
5856
87.7k
            void *data = writer.data;
5857
87.7k
            const unsigned char *last = e - 4;
5858
87.7k
            Py_ssize_t pos = writer.pos;
5859
87.7k
            if (le) {
5860
2.66M
                do {
5861
2.66M
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5862
2.66M
                    if (ch > maxch)
5863
81.8k
                        break;
5864
2.58M
                    if (kind != PyUnicode_1BYTE_KIND &&
5865
2.55M
                        Py_UNICODE_IS_SURROGATE(ch))
5866
136
                        break;
5867
2.58M
                    PyUnicode_WRITE(kind, data, pos++, ch);
5868
2.58M
                    q += 4;
5869
2.58M
                } while (q <= last);
5870
83.1k
            }
5871
4.59k
            else {
5872
8.00k
                do {
5873
8.00k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5874
8.00k
                    if (ch > maxch)
5875
4.11k
                        break;
5876
3.88k
                    if (kind != PyUnicode_1BYTE_KIND &&
5877
3.30k
                        Py_UNICODE_IS_SURROGATE(ch))
5878
132
                        break;
5879
3.75k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5880
3.75k
                    q += 4;
5881
3.75k
                } while (q <= last);
5882
4.59k
            }
5883
87.7k
            writer.pos = pos;
5884
87.7k
        }
5885
5886
        /* Classify why the fast loop stopped (or never ran). */
112k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5887
274
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5888
274
            startinpos = ((const char *)q) - starts;
5889
274
            endinpos = startinpos + 4;
5890
274
        }
5891
112k
        else if (ch <= maxch) {
5892
26.4k
            if (q == e || consumed)
5893
6.43k
                break;
5894
            /* remaining bytes at the end? (size should be divisible by 4) */
5895
20.0k
            errmsg = "truncated data";
5896
20.0k
            startinpos = ((const char *)q) - starts;
5897
20.0k
            endinpos = ((const char *)e) - starts;
5898
20.0k
        }
5899
85.9k
        else {
5900
85.9k
            if (ch < 0x110000) {
                /* Valid code point that is too wide for the current buffer:
                   write it through the writer and resume the loop. */
5901
5.39k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5902
0
                    goto onError;
5903
5.39k
                q += 4;
5904
5.39k
                continue;
5905
5.39k
            }
5906
80.5k
            errmsg = "code point not in range(0x110000)";
5907
80.5k
            startinpos = ((const char *)q) - starts;
5908
80.5k
            endinpos = startinpos + 4;
5909
80.5k
        }
5910
5911
        /* The remaining input chars are ignored if the callback
5912
           chooses to skip the input */
5913
100k
        if (unicode_decode_call_errorhandler_writer(
5914
100k
                errors, &errorHandler,
5915
100k
                encoding, errmsg,
5916
100k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5917
100k
                &writer))
5918
26.9k
            goto onError;
5919
100k
    }
5920
5921
6.43k
    if (consumed)
5922
0
        *consumed = (const char *)q-starts;
5923
5924
6.43k
    Py_XDECREF(errorHandler);
5925
6.43k
    Py_XDECREF(exc);
5926
6.43k
    return _PyUnicodeWriter_Finish(&writer);
5927
5928
26.9k
  onError:
5929
26.9k
    _PyUnicodeWriter_Dealloc(&writer);
5930
26.9k
    Py_XDECREF(errorHandler);
5931
26.9k
    Py_XDECREF(exc);
5932
26.9k
    return NULL;
5933
33.4k
}
5934
5935
/* Encode `str` as UTF-32 and return a bytes object, or NULL on failure.

   byteorder == -1 selects "utf-32-le" and byteorder == 1 "utf-32-be".
   With byteorder == 0 the encoding name is "utf-32" and 0xFEFF is written
   as the first 32-bit unit.  Positions that the per-kind encoder cannot
   handle are passed to unicode_encode_call_errorhandler() with `errors`. */
PyObject *
5936
_PyUnicode_EncodeUTF32(PyObject *str,
5937
                       const char *errors,
5938
                       int byteorder)
5939
0
{
5940
0
    if (!PyUnicode_Check(str)) {
5941
0
        PyErr_BadArgument();
5942
0
        return NULL;
5943
0
    }
5944
0
    int kind = PyUnicode_KIND(str);
5945
0
    const void *data = PyUnicode_DATA(str);
5946
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
5947
5948
    /* Guard nsize * 4 below; one extra unit is needed for the BOM when
       byteorder == 0. */
0
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5949
0
        return PyErr_NoMemory();
5950
0
    Py_ssize_t nsize = len + (byteorder == 0);
5951
5952
0
#if PY_LITTLE_ENDIAN
5953
0
    int native_ordering = byteorder <= 0;
5954
#else
5955
    int native_ordering = byteorder >= 0;
5956
#endif
5957
5958
0
    if (kind == PyUnicode_1BYTE_KIND) {
5959
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
5960
        // on short strings
5961
0
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5962
0
        if (v == NULL) {
5963
0
            return NULL;
5964
0
        }
5965
5966
        /* output buffer is 4-bytes aligned */
5967
0
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5968
0
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
5969
0
        if (byteorder == 0) {
5970
0
            *out++ = 0xFEFF;
5971
0
        }
5972
0
        if (len > 0) {
5973
0
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
5974
0
                                 &out, native_ordering);
5975
0
        }
5976
0
        return v;
5977
0
    }
5978
5979
0
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
5980
0
    if (writer == NULL) {
5981
0
        return NULL;
5982
0
    }
5983
5984
    /* output buffer is 4-bytes aligned */
5985
0
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
5986
0
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
5987
0
    if (byteorder == 0) {
5988
0
        *out++ = 0xFEFF;
5989
0
    }
5990
0
    if (len == 0) {
5991
0
        return PyBytesWriter_Finish(writer);
5992
0
    }
5993
5994
0
    const char *encoding;
5995
0
    if (byteorder == -1)
5996
0
        encoding = "utf-32-le";
5997
0
    else if (byteorder == 1)
5998
0
        encoding = "utf-32-be";
5999
0
    else
6000
0
        encoding = "utf-32";
6001
6002
0
    PyObject *errorHandler = NULL;
6003
0
    PyObject *exc = NULL;
6004
0
    PyObject *rep = NULL;
6005
6006
0
    for (Py_ssize_t pos = 0; pos < len; ) {
        /* The helper's return value is added to pos; if that does not reach
           len, the character at pos is handed to the error handler. */
6007
0
        if (kind == PyUnicode_2BYTE_KIND) {
6008
0
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
6009
0
                                        &out, native_ordering);
6010
0
        }
6011
0
        else {
6012
0
            assert(kind == PyUnicode_4BYTE_KIND);
6013
0
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
6014
0
                                        &out, native_ordering);
6015
0
        }
6016
0
        if (pos == len)
6017
0
            break;
6018
6019
0
        Py_ssize_t newpos;
6020
0
        rep = unicode_encode_call_errorhandler(
6021
0
                errors, &errorHandler,
6022
0
                encoding, "surrogates not allowed",
6023
0
                str, &exc, pos, pos + 1, &newpos);
6024
0
        if (!rep)
6025
0
            goto error;
6026
6027
0
        Py_ssize_t repsize, moreunits;
6028
0
        if (PyBytes_Check(rep)) {
6029
0
            repsize = PyBytes_GET_SIZE(rep);
            /* A bytes replacement must be a whole number of 4-byte units. */
6030
0
            if (repsize & 3) {
6031
0
                raise_encode_exception(&exc, encoding,
6032
0
                                       str, pos, pos + 1,
6033
0
                                       "surrogates not allowed");
6034
0
                goto error;
6035
0
            }
6036
0
            moreunits = repsize / 4;
6037
0
        }
6038
0
        else {
6039
0
            assert(PyUnicode_Check(rep));
6040
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            /* A str replacement is only accepted if it is pure ASCII. */
6041
0
            if (!PyUnicode_IS_ASCII(rep)) {
6042
0
                raise_encode_exception(&exc, encoding,
6043
0
                                       str, pos, pos + 1,
6044
0
                                       "surrogates not allowed");
6045
0
                goto error;
6046
0
            }
6047
0
        }
        /* Net extra output units: replacement size minus the input
           characters the handler skipped (newpos - pos). */
6048
0
        moreunits += pos - newpos;
6049
0
        pos = newpos;
6050
6051
        /* four bytes are reserved for each surrogate */
6052
0
        if (moreunits > 0) {
6053
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
6054
0
            if (out == NULL) {
6055
0
                goto error;
6056
0
            }
6057
0
        }
6058
6059
0
        if (PyBytes_Check(rep)) {
6060
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6061
0
            out += repsize / 4;
6062
0
        }
6063
0
        else {
6064
            /* rep is unicode */
6065
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6066
0
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6067
0
                                 &out, native_ordering);
6068
0
        }
6069
6070
0
        Py_CLEAR(rep);
6071
0
    }
6072
6073
0
    Py_XDECREF(errorHandler);
6074
0
    Py_XDECREF(exc);
6075
6076
    /* Cut back to size actually needed. This is necessary for, for example,
6077
       encoding of a string containing isolated surrogates and the 'ignore'
6078
       handler is used. */
6079
0
    return PyBytesWriter_FinishWithPointer(writer, out);
6080
6081
0
  error:
6082
0
    Py_XDECREF(rep);
6083
0
    Py_XDECREF(errorHandler);
6084
0
    Py_XDECREF(exc);
6085
0
    PyBytesWriter_Discard(writer);
6086
0
    return NULL;
6087
0
}
6088
6089
/* Encode `unicode` as UTF-32; equivalent to
   _PyUnicode_EncodeUTF32(unicode, NULL, 0). */
PyObject *
6090
PyUnicode_AsUTF32String(PyObject *unicode)
6091
0
{
6092
0
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6093
0
}
6094
6095
/* --- UTF-16 Codec ------------------------------------------------------- */
6096
6097
/* Decode UTF-16; equivalent to PyUnicode_DecodeUTF16Stateful() with a NULL
   `consumed` argument. */
PyObject *
6098
PyUnicode_DecodeUTF16(const char *s,
6099
                      Py_ssize_t size,
6100
                      const char *errors,
6101
                      int *byteorder)
6102
172
{
6103
172
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6104
172
}
6105
6106
PyObject *
6107
PyUnicode_DecodeUTF16Stateful(const char *s,
6108
                              Py_ssize_t size,
6109
                              const char *errors,
6110
                              int *byteorder,
6111
                              Py_ssize_t *consumed)
6112
14.3k
{
6113
14.3k
    const char *starts = s;
6114
14.3k
    Py_ssize_t startinpos;
6115
14.3k
    Py_ssize_t endinpos;
6116
14.3k
    _PyUnicodeWriter writer;
6117
14.3k
    const unsigned char *q, *e;
6118
14.3k
    int bo = 0;       /* assume native ordering by default */
6119
14.3k
    int native_ordering;
6120
14.3k
    const char *errmsg = "";
6121
14.3k
    PyObject *errorHandler = NULL;
6122
14.3k
    PyObject *exc = NULL;
6123
14.3k
    const char *encoding;
6124
6125
14.3k
    q = (const unsigned char *)s;
6126
14.3k
    e = q + size;
6127
6128
14.3k
    if (byteorder)
6129
14.2k
        bo = *byteorder;
6130
6131
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6132
       byte order setting accordingly. In native mode, the leading BOM
6133
       mark is skipped, in all other modes, it is copied to the output
6134
       stream as-is (giving a ZWNBSP character). */
6135
14.3k
    if (bo == 0 && size >= 2) {
6136
13.5k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6137
13.5k
        if (bom == 0xFEFF) {
6138
313
            q += 2;
6139
313
            bo = -1;
6140
313
        }
6141
13.2k
        else if (bom == 0xFFFE) {
6142
2.46k
            q += 2;
6143
2.46k
            bo = 1;
6144
2.46k
        }
6145
13.5k
        if (byteorder)
6146
13.3k
            *byteorder = bo;
6147
13.5k
    }
6148
6149
14.3k
    if (q == e) {
6150
94
        if (consumed)
6151
0
            *consumed = size;
6152
94
        _Py_RETURN_UNICODE_EMPTY();
6153
94
    }
6154
6155
14.2k
#if PY_LITTLE_ENDIAN
6156
14.2k
    native_ordering = bo <= 0;
6157
14.2k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6158
#else
6159
    native_ordering = bo >= 0;
6160
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6161
#endif
6162
6163
    /* Note: size will always be longer than the resulting Unicode
6164
       character count normally.  Error handler will take care of
6165
       resizing when needed. */
6166
14.2k
    _PyUnicodeWriter_Init(&writer);
6167
14.2k
    writer.min_length = (e - q + 1) / 2;
6168
14.2k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6169
0
        goto onError;
6170
6171
50.6k
    while (1) {
6172
50.6k
        Py_UCS4 ch = 0;
6173
50.6k
        if (e - q >= 2) {
6174
42.6k
            int kind = writer.kind;
6175
42.6k
            if (kind == PyUnicode_1BYTE_KIND) {
6176
17.2k
                if (PyUnicode_IS_ASCII(writer.buffer))
6177
13.6k
                    ch = asciilib_utf16_decode(&q, e,
6178
13.6k
                            (Py_UCS1*)writer.data, &writer.pos,
6179
13.6k
                            native_ordering);
6180
3.53k
                else
6181
3.53k
                    ch = ucs1lib_utf16_decode(&q, e,
6182
3.53k
                            (Py_UCS1*)writer.data, &writer.pos,
6183
3.53k
                            native_ordering);
6184
25.3k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6185
11.8k
                ch = ucs2lib_utf16_decode(&q, e,
6186
11.8k
                        (Py_UCS2*)writer.data, &writer.pos,
6187
11.8k
                        native_ordering);
6188
13.5k
            } else {
6189
13.5k
                assert(kind == PyUnicode_4BYTE_KIND);
6190
13.5k
                ch = ucs4lib_utf16_decode(&q, e,
6191
13.5k
                        (Py_UCS4*)writer.data, &writer.pos,
6192
13.5k
                        native_ordering);
6193
13.5k
            }
6194
42.6k
        }
6195
6196
50.6k
        switch (ch)
6197
50.6k
        {
6198
14.9k
        case 0:
6199
            /* remaining byte at the end? (size should be even) */
6200
14.9k
            if (q == e || consumed)
6201
9.73k
                goto End;
6202
5.25k
            errmsg = "truncated data";
6203
5.25k
            startinpos = ((const char *)q) - starts;
6204
5.25k
            endinpos = ((const char *)e) - starts;
6205
5.25k
            break;
6206
            /* The remaining input chars are ignored if the callback
6207
               chooses to skip the input */
6208
1.59k
        case 1:
6209
1.59k
            q -= 2;
6210
1.59k
            if (consumed)
6211
0
                goto End;
6212
1.59k
            errmsg = "unexpected end of data";
6213
1.59k
            startinpos = ((const char *)q) - starts;
6214
1.59k
            endinpos = ((const char *)e) - starts;
6215
1.59k
            break;
6216
11.5k
        case 2:
6217
11.5k
            errmsg = "illegal encoding";
6218
11.5k
            startinpos = ((const char *)q) - 2 - starts;
6219
11.5k
            endinpos = startinpos + 2;
6220
11.5k
            break;
6221
7.21k
        case 3:
6222
7.21k
            errmsg = "illegal UTF-16 surrogate";
6223
7.21k
            startinpos = ((const char *)q) - 4 - starts;
6224
7.21k
            endinpos = startinpos + 2;
6225
7.21k
            break;
6226
15.3k
        default:
6227
15.3k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6228
0
                goto onError;
6229
15.3k
            continue;
6230
50.6k
        }
6231
6232
25.5k
        if (unicode_decode_call_errorhandler_writer(
6233
25.5k
                errors,
6234
25.5k
                &errorHandler,
6235
25.5k
                encoding, errmsg,
6236
25.5k
                &starts,
6237
25.5k
                (const char **)&e,
6238
25.5k
                &startinpos,
6239
25.5k
                &endinpos,
6240
25.5k
                &exc,
6241
25.5k
                (const char **)&q,
6242
25.5k
                &writer))
6243
4.54k
            goto onError;
6244
25.5k
    }
6245
6246
9.73k
End:
6247
9.73k
    if (consumed)
6248
0
        *consumed = (const char *)q-starts;
6249
6250
9.73k
    Py_XDECREF(errorHandler);
6251
9.73k
    Py_XDECREF(exc);
6252
9.73k
    return _PyUnicodeWriter_Finish(&writer);
6253
6254
4.54k
  onError:
6255
4.54k
    _PyUnicodeWriter_Dealloc(&writer);
6256
4.54k
    Py_XDECREF(errorHandler);
6257
4.54k
    Py_XDECREF(exc);
6258
4.54k
    return NULL;
6259
14.2k
}
6260
6261
/* Encode the str object `str` as UTF-16 and return a bytes object.
 *
 *   errors     error handler name used for code points the low-level
 *              encoder stops at (reported as "surrogates not allowed").
 *   byteorder  negative: little endian, positive: big endian,
 *              0: native order with a leading 0xFEFF code unit.
 *
 * Returns NULL after an error; a non-str argument is rejected with
 * PyErr_BadArgument(). */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    const int want_bom = (byteorder == 0);

    /* Code points at or above 0x10000 need one extra code unit each. */
    Py_ssize_t npairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *chars = (const Py_UCS4 *)data;
        for (Py_ssize_t i = 0; i < len; i++) {
            if (chars[i] >= 0x10000) {
                npairs++;
            }
        }
    }

    /* Reject sizes whose byte count (2 * units) would overflow. */
    if (len > PY_SSIZE_T_MAX / 2 - npairs - want_bom) {
        return PyErr_NoMemory();
    }
    const Py_ssize_t nunits = len + npairs + want_bom;

#if PY_BIG_ENDIAN
    const int native_ordering = byteorder >= 0;
#else
    const int native_ordering = byteorder <= 0;
#endif

    if (kind == PyUnicode_1BYTE_KIND) {
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
        // on short strings
        PyObject *result = PyBytes_FromStringAndSize(NULL, nunits * 2);
        if (result == NULL) {
            return NULL;
        }

        /* output buffer is 2-bytes aligned */
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(result), 2));
        unsigned short *dst = (unsigned short *)PyBytes_AS_STRING(result);
        if (want_bom) {
            *dst++ = 0xFEFF;
        }
        if (len > 0) {
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &dst,
                                 native_ordering);
        }
        return result;
    }

    PyBytesWriter *writer = PyBytesWriter_Create(nunits * 2);
    if (writer == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
    unsigned short *out = PyBytesWriter_GetData(writer);
    if (want_bom) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        return PyBytesWriter_Finish(writer);
    }

    /* Codec name reported to the error handler. */
    const char *encoding = (byteorder < 0) ? "utf-16-le"
                         : (byteorder > 0) ? "utf-16-be"
                         : "utf-16";

    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    Py_ssize_t pos = 0;
    while (pos < len) {
        /* Encode as far as the low-level encoder gets; it returns how many
           characters it consumed. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len) {
            break;
        }

        /* Stopped early: ask the error handler about the character at pos. */
        Py_ssize_t newpos;
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep) {
            goto error;
        }

        /* The replacement must be bytes of even length or an ASCII str. */
        const int rep_is_bytes = PyBytes_Check(rep);
        Py_ssize_t repsize, moreunits;
        int rep_invalid;
        if (rep_is_bytes) {
            repsize = PyBytes_GET_SIZE(rep);
            rep_invalid = (repsize & 1) != 0;
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            rep_invalid = !PyUnicode_IS_ASCII(rep);
        }
        if (rep_invalid) {
            raise_encode_exception(&exc, encoding,
                                   str, pos, pos + 1,
                                   "surrogates not allowed");
            goto error;
        }
        moreunits += pos - newpos;
        pos = newpos;

        /* two bytes are reserved for each surrogate */
        if (moreunits > 0) {
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
            if (out == NULL) {
                goto error;
            }
        }

        if (rep_is_bytes) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        }
        else {
            /* rep is unicode */
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);

    /* Cut back to size actually needed. This is necessary for, for example,
    encoding of a string containing isolated surrogates and the 'ignore' handler
    is used. */
    return PyBytesWriter_FinishWithPointer(writer, out);

  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    PyBytesWriter_Discard(writer);
    return NULL;
}
6426
6427
/* Encode `unicode` as UTF-16 by delegating to _PyUnicode_EncodeUTF16()
   with no error handler name and byteorder 0. */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const char *errors = NULL;
    const int byteorder = 0;

    return _PyUnicode_EncodeUTF16(unicode, errors, byteorder);
}
6432
6433
_PyUnicode_Name_CAPI *
6434
_PyUnicode_GetNameCAPI(void)
6435
15.7k
{
6436
15.7k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6437
15.7k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6438
6439
15.7k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6440
15.7k
    if (ucnhash_capi == NULL) {
6441
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6442
2
                PyUnicodeData_CAPSULE_NAME, 1);
6443
6444
        // It's fine if we overwrite the value here. It's always the same value.
6445
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6446
2
    }
6447
15.7k
    return ucnhash_capi;
6448
15.7k
}
6449
6450
/* --- Unicode Escape Codec ----------------------------------------------- */
6451
6452
PyObject *
6453
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6454
                               Py_ssize_t size,
6455
                               const char *errors,
6456
                               Py_ssize_t *consumed,
6457
                               int *first_invalid_escape_char,
6458
                               const char **first_invalid_escape_ptr)
6459
31.9k
{
6460
31.9k
    const char *starts = s;
6461
31.9k
    const char *initial_starts = starts;
6462
31.9k
    _PyUnicodeWriter writer;
6463
31.9k
    const char *end;
6464
31.9k
    PyObject *errorHandler = NULL;
6465
31.9k
    PyObject *exc = NULL;
6466
31.9k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6467
6468
    // so we can remember if we've seen an invalid escape char or not
6469
31.9k
    *first_invalid_escape_char = -1;
6470
31.9k
    *first_invalid_escape_ptr = NULL;
6471
6472
31.9k
    if (size == 0) {
6473
2.14k
        if (consumed) {
6474
0
            *consumed = 0;
6475
0
        }
6476
2.14k
        _Py_RETURN_UNICODE_EMPTY();
6477
2.14k
    }
6478
    /* Escaped strings will always be longer than the resulting
6479
       Unicode string, so we start with size here and then reduce the
6480
       length after conversion to the true value.
6481
       (but if the error callback returns a long replacement string
6482
       we'll have to allocate more space) */
6483
29.8k
    _PyUnicodeWriter_Init(&writer);
6484
29.8k
    writer.min_length = size;
6485
29.8k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6486
0
        goto onError;
6487
0
    }
6488
6489
29.8k
    end = s + size;
6490
8.97M
    while (s < end) {
6491
8.94M
        unsigned char c = (unsigned char) *s++;
6492
8.94M
        Py_UCS4 ch;
6493
8.94M
        int count;
6494
8.94M
        const char *message;
6495
6496
8.94M
#define WRITE_ASCII_CHAR(ch)                                                  \
6497
8.94M
            do {                                                              \
6498
115k
                assert(ch <= 127);                                            \
6499
115k
                assert(writer.pos < writer.size);                             \
6500
115k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6501
115k
            } while(0)
6502
6503
8.94M
#define WRITE_CHAR(ch)                                                        \
6504
8.94M
            do {                                                              \
6505
8.86M
                if (ch <= writer.maxchar) {                                   \
6506
8.84M
                    assert(writer.pos < writer.size);                         \
6507
8.84M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6508
8.84M
                }                                                             \
6509
8.86M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6510
0
                    goto onError;                                             \
6511
0
                }                                                             \
6512
8.86M
            } while(0)
6513
6514
        /* Non-escape characters are interpreted as Unicode ordinals */
6515
8.94M
        if (c != '\\') {
6516
8.65M
            WRITE_CHAR(c);
6517
8.65M
            continue;
6518
8.65M
        }
6519
6520
283k
        Py_ssize_t startinpos = s - starts - 1;
6521
        /* \ - Escapes */
6522
283k
        if (s >= end) {
6523
15
            message = "\\ at end of string";
6524
15
            goto incomplete;
6525
15
        }
6526
283k
        c = (unsigned char) *s++;
6527
6528
283k
        assert(writer.pos < writer.size);
6529
283k
        switch (c) {
6530
6531
            /* \x escapes */
6532
1.06k
        case '\n': continue;
6533
36.8k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6534
1.70k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6535
3.57k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6536
3.42k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6537
        /* FF */
6538
6.72k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6539
1.48k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6540
2.06k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6541
6.45k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6542
        /* VT */
6543
14.1k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6544
        /* BEL, not classic C */
6545
2.51k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6546
6547
            /* \OOO (octal) escapes */
6548
43.4k
        case '0': case '1': case '2': case '3':
6549
110k
        case '4': case '5': case '6': case '7':
6550
110k
            ch = c - '0';
6551
110k
            if (s < end && '0' <= *s && *s <= '7') {
6552
66.0k
                ch = (ch<<3) + *s++ - '0';
6553
66.0k
                if (s < end && '0' <= *s && *s <= '7') {
6554
52.3k
                    ch = (ch<<3) + *s++ - '0';
6555
52.3k
                }
6556
66.0k
            }
6557
110k
            if (ch > 0377) {
6558
48.9k
                if (*first_invalid_escape_char == -1) {
6559
1.15k
                    *first_invalid_escape_char = ch;
6560
1.15k
                    if (starts == initial_starts) {
6561
                        /* Back up 3 chars, since we've already incremented s. */
6562
1.15k
                        *first_invalid_escape_ptr = s - 3;
6563
1.15k
                    }
6564
1.15k
                }
6565
48.9k
            }
6566
110k
            WRITE_CHAR(ch);
6567
110k
            continue;
6568
6569
            /* hex escapes */
6570
            /* \xXX */
6571
110k
        case 'x':
6572
14.5k
            count = 2;
6573
14.5k
            message = "truncated \\xXX escape";
6574
14.5k
            goto hexescape;
6575
6576
            /* \uXXXX */
6577
6.69k
        case 'u':
6578
6.69k
            count = 4;
6579
6.69k
            message = "truncated \\uXXXX escape";
6580
6.69k
            goto hexescape;
6581
6582
            /* \UXXXXXXXX */
6583
19.7k
        case 'U':
6584
19.7k
            count = 8;
6585
19.7k
            message = "truncated \\UXXXXXXXX escape";
6586
40.9k
        hexescape:
6587
254k
            for (ch = 0; count; ++s, --count) {
6588
213k
                if (s >= end) {
6589
18
                    goto incomplete;
6590
18
                }
6591
213k
                c = (unsigned char)*s;
6592
213k
                ch <<= 4;
6593
213k
                if (c >= '0' && c <= '9') {
6594
151k
                    ch += c - '0';
6595
151k
                }
6596
61.5k
                else if (c >= 'a' && c <= 'f') {
6597
61.0k
                    ch += c - ('a' - 10);
6598
61.0k
                }
6599
499
                else if (c >= 'A' && c <= 'F') {
6600
459
                    ch += c - ('A' - 10);
6601
459
                }
6602
40
                else {
6603
40
                    goto error;
6604
40
                }
6605
213k
            }
6606
6607
            /* when we get here, ch is a 32-bit unicode character */
6608
40.9k
            if (ch > MAX_UNICODE) {
6609
8
                message = "illegal Unicode character";
6610
8
                goto error;
6611
8
            }
6612
6613
40.9k
            WRITE_CHAR(ch);
6614
40.9k
            continue;
6615
6616
            /* \N{name} */
6617
40.9k
        case 'N':
6618
15.7k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6619
15.7k
            if (ucnhash_capi == NULL) {
6620
0
                PyErr_SetString(
6621
0
                        PyExc_UnicodeError,
6622
0
                        "\\N escapes not supported (can't load unicodedata module)"
6623
0
                );
6624
0
                goto onError;
6625
0
            }
6626
6627
15.7k
            message = "malformed \\N character escape";
6628
15.7k
            if (s >= end) {
6629
6
                goto incomplete;
6630
6
            }
6631
15.7k
            if (*s == '{') {
6632
15.6k
                const char *start = ++s;
6633
15.6k
                size_t namelen;
6634
                /* look for the closing brace */
6635
90.2k
                while (s < end && *s != '}')
6636
74.5k
                    s++;
6637
15.6k
                if (s >= end) {
6638
20
                    goto incomplete;
6639
20
                }
6640
15.6k
                namelen = s - start;
6641
15.6k
                if (namelen) {
6642
                    /* found a name.  look it up in the unicode database */
6643
15.6k
                    s++;
6644
15.6k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6645
15.6k
                    if (namelen <= INT_MAX &&
6646
15.6k
                        ucnhash_capi->getcode(start, (int)namelen,
6647
15.6k
                                              &ch, 0)) {
6648
15.5k
                        assert(ch <= MAX_UNICODE);
6649
15.5k
                        WRITE_CHAR(ch);
6650
15.5k
                        continue;
6651
15.5k
                    }
6652
106
                    message = "unknown Unicode character name";
6653
106
                }
6654
15.6k
            }
6655
125
            goto error;
6656
6657
36.2k
        default:
6658
36.2k
            if (*first_invalid_escape_char == -1) {
6659
4.14k
                *first_invalid_escape_char = c;
6660
4.14k
                if (starts == initial_starts) {
6661
                    /* Back up one char, since we've already incremented s. */
6662
4.14k
                    *first_invalid_escape_ptr = s - 1;
6663
4.14k
                }
6664
4.14k
            }
6665
36.2k
            WRITE_ASCII_CHAR('\\');
6666
36.2k
            WRITE_CHAR(c);
6667
36.2k
            continue;
6668
283k
        }
6669
6670
59
      incomplete:
6671
59
        if (consumed) {
6672
0
            *consumed = startinpos;
6673
0
            break;
6674
0
        }
6675
232
      error:;
6676
232
        Py_ssize_t endinpos = s-starts;
6677
232
        writer.min_length = end - s + writer.pos;
6678
232
        if (unicode_decode_call_errorhandler_writer(
6679
232
                errors, &errorHandler,
6680
232
                "unicodeescape", message,
6681
232
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6682
232
                &writer)) {
6683
232
            goto onError;
6684
232
        }
6685
232
        assert(end - s <= writer.size - writer.pos);
6686
6687
0
#undef WRITE_ASCII_CHAR
6688
0
#undef WRITE_CHAR
6689
0
    }
6690
6691
29.5k
    Py_XDECREF(errorHandler);
6692
29.5k
    Py_XDECREF(exc);
6693
29.5k
    return _PyUnicodeWriter_Finish(&writer);
6694
6695
232
  onError:
6696
232
    _PyUnicodeWriter_Dealloc(&writer);
6697
232
    Py_XDECREF(errorHandler);
6698
232
    Py_XDECREF(exc);
6699
232
    return NULL;
6700
29.8k
}
6701
6702
PyObject *
6703
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6704
                              Py_ssize_t size,
6705
                              const char *errors,
6706
                              Py_ssize_t *consumed)
6707
545
{
6708
545
    int first_invalid_escape_char;
6709
545
    const char *first_invalid_escape_ptr;
6710
545
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6711
545
                                                      consumed,
6712
545
                                                      &first_invalid_escape_char,
6713
545
                                                      &first_invalid_escape_ptr);
6714
545
    if (result == NULL)
6715
122
        return NULL;
6716
423
    if (first_invalid_escape_char != -1) {
6717
303
        if (first_invalid_escape_char > 0xff) {
6718
96
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6719
96
                                 "\"\\%o\" is an invalid octal escape sequence. "
6720
96
                                 "Such sequences will not work in the future. ",
6721
96
                                 first_invalid_escape_char) < 0)
6722
0
            {
6723
0
                Py_DECREF(result);
6724
0
                return NULL;
6725
0
            }
6726
96
        }
6727
207
        else {
6728
207
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6729
207
                                 "\"\\%c\" is an invalid escape sequence. "
6730
207
                                 "Such sequences will not work in the future. ",
6731
207
                                 first_invalid_escape_char) < 0)
6732
0
            {
6733
0
                Py_DECREF(result);
6734
0
                return NULL;
6735
0
            }
6736
207
        }
6737
303
    }
6738
423
    return result;
6739
423
}
6740
6741
PyObject *
6742
PyUnicode_DecodeUnicodeEscape(const char *s,
6743
                              Py_ssize_t size,
6744
                              const char *errors)
6745
0
{
6746
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6747
0
}
6748
6749
/* Return a Unicode-Escape string version of the Unicode object. */
6750
6751
/* Build the "unicode-escape" encoding of `unicode` as a bytes object.
 *
 * Printable ASCII is copied unchanged except for the backslash, which is
 * doubled.  Tab, newline and carriage return become \t, \n and \r; other
 * characters below 0x100 become \xHH; characters below 0x10000 become
 * \uHHHH; everything else becomes \U00HHHHHH.
 *
 * A non-str argument is rejected with PyErr_BadArgument().  Returns NULL
 * after an error. */
PyObject *
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    if (length == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* Size the buffer for the longest possible escape per character:
     *   UCS1: '\xxx'       ->  4 bytes
     *   UCS2: '\uxxxx'     ->  6 bytes
     *   UCS4: '\U00xxxxxx' -> 10 bytes */
    const Py_ssize_t worst_case = kind * 2 + 2;
    if (length > PY_SSIZE_T_MAX / worst_case) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(worst_case * length);
    if (writer == NULL) {
        return NULL;
    }
    char *dst = PyBytesWriter_GetData(writer);

    for (Py_ssize_t idx = 0; idx < length; idx++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, idx);

        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
        if (ch >= 0x10000) {
            /* Make sure that the first two digits are zero */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *dst++ = '\\';
            *dst++ = 'U';
            *dst++ = '0';
            *dst++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *dst++ = Py_hexdigits[(ch >> shift) & 0x0000000F];
            }
            continue;
        }

        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
        if (ch >= 0x100) {
            *dst++ = '\\';
            *dst++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *dst++ = Py_hexdigits[(ch >> shift) & 0x000F];
            }
            continue;
        }

        /* U+0000-U+00ff range from here on */

        /* Printable US ASCII is copied as-is; a backslash gets a second
           backslash in front of it. */
        if (ch >= ' ' && ch < 127) {
            if (ch == '\\') {
                *dst++ = '\\';
            }
            *dst++ = (char) ch;
            continue;
        }

        switch (ch) {
        /* Map special whitespace to '\t', \n', '\r' */
        case '\t':
            *dst++ = '\\';
            *dst++ = 't';
            break;
        case '\n':
            *dst++ = '\\';
            *dst++ = 'n';
            break;
        case '\r':
            *dst++ = '\\';
            *dst++ = 'r';
            break;
        /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
        default:
            *dst++ = '\\';
            *dst++ = 'x';
            *dst++ = Py_hexdigits[(ch >> 4) & 0x000F];
            *dst++ = Py_hexdigits[ch & 0x000F];
            break;
        }
    }

    /* Trim the worst-case allocation to what was actually written. */
    return PyBytesWriter_FinishWithPointer(writer, dst);
}
6851
6852
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6853
6854
/* Decode the `size` bytes at `s` as raw-unicode-escape and return the
   resulting str object, or NULL with an exception set on error.

   Every byte other than a backslash is written as the code point with the
   same ordinal.  "\uXXXX" and "\UXXXXXXXX" are parsed as 4 resp. 8 hex
   digits; a backslash followed by any other byte is emitted unchanged as
   two characters.  Non-hex digits, values above MAX_UNICODE and (when
   `consumed` is NULL) truncated \u/\U escapes are handed to the `errors`
   handler through unicode_decode_call_errorhandler_writer().  A lone
   backslash as the very last byte is copied literally when `consumed` is
   NULL.

   If `consumed` is not NULL, an escape that is cut off by the end of the
   input is not an error: decoding stops there and *consumed receives the
   offset of its backslash.  *consumed is written only in that case and
   for empty input; when the whole input is decoded this function leaves
   it untouched. */
PyObject *
6855
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6856
                                          Py_ssize_t size,
6857
                                          const char *errors,
6858
                                          Py_ssize_t *consumed)
6859
112
{
6860
112
    const char *starts = s;
6861
112
    _PyUnicodeWriter writer;
6862
112
    const char *end;
6863
112
    PyObject *errorHandler = NULL;
6864
112
    PyObject *exc = NULL;
6865
6866
112
    if (size == 0) {
6867
0
        if (consumed) {
6868
0
            *consumed = 0;
6869
0
        }
6870
0
        _Py_RETURN_UNICODE_EMPTY();
6871
0
    }
6872
6873
    /* Escaped strings will always be longer than the resulting
6874
       Unicode string, so we start with size here and then reduce the
6875
       length after conversion to the true value. (But decoding error
6876
       handler might have to resize the string) */
6877
112
    _PyUnicodeWriter_Init(&writer);
6878
112
    writer.min_length = size;
6879
112
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6880
0
        goto onError;
6881
0
    }
6882
6883
112
    end = s + size;
6884
46.7k
    while (s < end) {
6885
46.6k
        unsigned char c = (unsigned char) *s++;
6886
46.6k
        Py_UCS4 ch;
6887
46.6k
        int count;
6888
46.6k
        const char *message;
6889
6890
46.6k
#define WRITE_CHAR(ch)                                                        \
6891
46.6k
            do {                                                              \
6892
46.6k
                if (ch <= writer.maxchar) {                                   \
6893
46.5k
                    assert(writer.pos < writer.size);                         \
6894
46.5k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6895
46.5k
                }                                                             \
6896
46.6k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6897
0
                    goto onError;                                             \
6898
0
                }                                                             \
6899
46.6k
            } while(0)
6900
6901
        /* Non-escape characters are interpreted as Unicode ordinals */
        /* A backslash that is the last input byte is also copied literally
           here when the caller did not pass `consumed`. */
6902
46.6k
        if (c != '\\' || (s >= end && !consumed)) {
6903
43.5k
            WRITE_CHAR(c);
6904
43.5k
            continue;
6905
43.5k
        }
6906
6907
3.07k
        Py_ssize_t startinpos = s - starts - 1;
6908
        /* \ - Escapes */
6909
3.07k
        if (s >= end) {
6910
0
            assert(consumed);
6911
            // Set message to silence compiler warning.
6912
            // Actually it is never used.
6913
0
            message = "\\ at end of string";
6914
0
            goto incomplete;
6915
0
        }
6916
6917
3.07k
        c = (unsigned char) *s++;
6918
3.07k
        if (c == 'u') {
6919
404
            count = 4;
6920
404
            message = "truncated \\uXXXX escape";
6921
404
        }
6922
2.66k
        else if (c == 'U') {
6923
542
            count = 8;
6924
542
            message = "truncated \\UXXXXXXXX escape";
6925
542
        }
        /* Neither \u nor \U: keep the backslash and the following byte.
           The backslash is written directly, without the maxchar check
           that WRITE_CHAR performs. */
6926
2.12k
        else {
6927
2.12k
            assert(writer.pos < writer.size);
6928
2.12k
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6929
2.12k
            WRITE_CHAR(c);
6930
2.12k
            continue;
6931
2.12k
        }
6932
6933
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6934
6.78k
        for (ch = 0; count; ++s, --count) {
6935
5.86k
            if (s >= end) {
6936
7
                goto incomplete;
6937
7
            }
6938
5.85k
            c = (unsigned char)*s;
6939
5.85k
            ch <<= 4;
6940
5.85k
            if (c >= '0' && c <= '9') {
6941
5.00k
                ch += c - '0';
6942
5.00k
            }
6943
850
            else if (c >= 'a' && c <= 'f') {
6944
736
                ch += c - ('a' - 10);
6945
736
            }
6946
114
            else if (c >= 'A' && c <= 'F') {
6947
95
                ch += c - ('A' - 10);
6948
95
            }
6949
19
            else {
6950
19
                goto error;
6951
19
            }
6952
5.85k
        }
6953
920
        if (ch > MAX_UNICODE) {
6954
3
            message = "\\Uxxxxxxxx out of range";
6955
3
            goto error;
6956
3
        }
6957
917
        WRITE_CHAR(ch);
6958
917
        continue;
6959
        /* Input ended inside an escape: with `consumed`, record where the
           escape started and stop; otherwise fall through to `error`. */
6960
917
      incomplete:
6961
7
        if (consumed) {
6962
0
            *consumed = startinpos;
6963
0
            break;
6964
0
        }
6965
29
      error:;
6966
29
        Py_ssize_t endinpos = s-starts;
6967
29
        writer.min_length = end - s + writer.pos;
6968
29
        if (unicode_decode_call_errorhandler_writer(
6969
29
                errors, &errorHandler,
6970
29
                "rawunicodeescape", message,
6971
29
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6972
29
                &writer)) {
6973
29
            goto onError;
6974
29
        }
6975
29
        assert(end - s <= writer.size - writer.pos);
6976
6977
0
#undef WRITE_CHAR
6978
0
    }
6979
83
    Py_XDECREF(errorHandler);
6980
83
    Py_XDECREF(exc);
6981
83
    return _PyUnicodeWriter_Finish(&writer);
6982
6983
29
  onError:
6984
29
    _PyUnicodeWriter_Dealloc(&writer);
6985
29
    Py_XDECREF(errorHandler);
6986
29
    Py_XDECREF(exc);
6987
29
    return NULL;
6988
112
}
6989
6990
/* Decode `size` bytes at `s` as raw-unicode-escape.  Thin wrapper around
   _PyUnicode_DecodeRawUnicodeEscapeStateful() with `consumed` set to NULL,
   i.e. a truncated escape is treated as an error rather than left
   unconsumed. */
PyObject *
6991
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6992
                                 Py_ssize_t size,
6993
                                 const char *errors)
6994
0
{
6995
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
6996
0
}
6997
6998
6999
/* Encode `unicode` with the raw-unicode-escape codec and return the result
   as a bytes object.

   Code points below 0x100 are copied as single bytes, 0x100..0xFFFF become
   "\uHHHH" and everything above becomes "\U00HHHHHH".  Returns NULL after
   PyErr_BadArgument() if `unicode` is not a str, or NULL if the output
   buffer cannot be created. */
PyObject *
7000
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
7001
274k
{
7002
274k
    if (!PyUnicode_Check(unicode)) {
7003
0
        PyErr_BadArgument();
7004
0
        return NULL;
7005
0
    }
7006
274k
    int kind = PyUnicode_KIND(unicode);
7007
274k
    const void *data = PyUnicode_DATA(unicode);
7008
274k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
7009
274k
    if (len == 0) {
7010
530
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7011
530
    }
    /* 1-byte strings need no escaping: the character buffer is copied
       into the bytes object as-is. */
7012
273k
    if (kind == PyUnicode_1BYTE_KIND) {
7013
273k
        return PyBytes_FromStringAndSize(data, len);
7014
273k
    }
7015
7016
    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
7017
       bytes, and 1 byte characters 4. */
7018
296
    Py_ssize_t expandsize = kind * 2 + 2;
    /* Reject lengths for which expandsize * len would overflow. */
7019
296
    if (len > PY_SSIZE_T_MAX / expandsize) {
7020
0
        return PyErr_NoMemory();
7021
0
    }
7022
7023
296
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
7024
296
    if (writer == NULL) {
7025
0
        return NULL;
7026
0
    }
7027
296
    char *p = PyBytesWriter_GetData(writer);
7028
7029
4.92M
    for (Py_ssize_t pos = 0; pos < len; pos++) {
7030
4.92M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7031
7032
        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7033
4.92M
        if (ch < 0x100) {
7034
4.88M
            *p++ = (char) ch;
7035
4.88M
        }
7036
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
7037
36.7k
        else if (ch < 0x10000) {
7038
36.2k
            *p++ = '\\';
7039
36.2k
            *p++ = 'u';
7040
36.2k
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7041
36.2k
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7042
36.2k
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7043
36.2k
            *p++ = Py_hexdigits[ch & 15];
7044
36.2k
        }
7045
        /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7046
579
        else {
7047
579
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7048
579
            *p++ = '\\';
7049
579
            *p++ = 'U';
7050
579
            *p++ = '0';
7051
579
            *p++ = '0';
7052
579
            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7053
579
            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7054
579
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7055
579
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7056
579
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7057
579
            *p++ = Py_hexdigits[ch & 15];
7058
579
        }
7059
4.92M
    }
7060
    /* `p` marks the end of the bytes actually written. */
7061
296
    return PyBytesWriter_FinishWithPointer(writer, p);
7062
296
}
7063
7064
/* --- Latin-1 Codec ------------------------------------------------------ */
7065
7066
/* Decode `size` bytes at `s` as Latin-1 by building a str directly from
   the bytes via _PyUnicode_FromUCS1().  `errors` is accepted but not used
   by this function. */
PyObject *
7067
PyUnicode_DecodeLatin1(const char *s,
7068
                       Py_ssize_t size,
7069
                       const char *errors)
7070
2.87M
{
7071
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7072
2.87M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7073
2.87M
}
7074
7075
/* Create or adjust a UnicodeEncodeError.

   If *exceptionObject is NULL, a new UnicodeEncodeError is created from
   (encoding, unicode, startpos, endpos, reason) and stored there; it stays
   NULL if that call fails.  Otherwise the start, end and reason of the
   existing exception are updated in place; if any of those setters fails,
   *exceptionObject is cleared to NULL.  Callers therefore detect failure
   by checking *exceptionObject afterwards. */
7076
static void
7077
make_encode_exception(PyObject **exceptionObject,
7078
                      const char *encoding,
7079
                      PyObject *unicode,
7080
                      Py_ssize_t startpos, Py_ssize_t endpos,
7081
                      const char *reason)
7082
718k
{
7083
718k
    if (*exceptionObject == NULL) {
7084
718k
        *exceptionObject = PyObject_CallFunction(
7085
718k
            PyExc_UnicodeEncodeError, "sOnns",
7086
718k
            encoding, unicode, startpos, endpos, reason);
7087
718k
    }
7088
0
    else {
7089
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7090
0
            goto onError;
7091
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7092
0
            goto onError;
7093
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7094
0
            goto onError;
7095
0
        return;
7096
0
      onError:
7097
0
        Py_CLEAR(*exceptionObject);
7098
0
    }
7099
718k
}
7100
7101
/* Raise a UnicodeEncodeError.

   Builds (or updates) the exception in *exceptionObject with
   make_encode_exception() and, if that produced an exception object,
   passes it to PyCodec_StrictErrors().  The object is left in
   *exceptionObject; this function does not release it. */
7102
static void
7103
raise_encode_exception(PyObject **exceptionObject,
7104
                       const char *encoding,
7105
                       PyObject *unicode,
7106
                       Py_ssize_t startpos, Py_ssize_t endpos,
7107
                       const char *reason)
7108
570k
{
7109
570k
    make_encode_exception(exceptionObject,
7110
570k
                          encoding, unicode, startpos, endpos, reason);
7111
570k
    if (*exceptionObject != NULL)
7112
570k
        PyCodec_StrictErrors(*exceptionObject);
7113
570k
}
7114
7115
/* error handling callback helper:
7116
   build arguments, call the callback and check the arguments,
7117
   put the result into newpos and return the replacement string, which
7118
   has to be freed by the caller */
7119
/* Invoke the encoding error handler named `errors` for the range
   [startpos, endpos) of `unicode`.

   *errorHandler caches the callable: it is looked up with
   PyCodec_LookupError() only while it is still NULL.  *exceptionObject is
   created or updated through make_encode_exception() and passed to the
   handler as its single argument.

   The handler must return a tuple (replacement, newpos) where replacement
   is a str or bytes object, otherwise TypeError is raised.  A negative
   newpos is taken relative to the end of `unicode`; a position outside
   0..len raises IndexError.  On success *newpos holds the normalized
   position and the return value is a new reference to the replacement.
   Returns NULL on any failure. */
static PyObject *
7120
unicode_encode_call_errorhandler(const char *errors,
7121
                                 PyObject **errorHandler,
7122
                                 const char *encoding, const char *reason,
7123
                                 PyObject *unicode, PyObject **exceptionObject,
7124
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7125
                                 Py_ssize_t *newpos)
7126
148k
{
    /* "On" is the PyArg_ParseTuple format; the text after ';' is the error
       message, also reused below as &argparse[3]. */
7127
148k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7128
148k
    Py_ssize_t len;
7129
148k
    PyObject *restuple;
7130
148k
    PyObject *resunicode;
7131
7132
148k
    if (*errorHandler == NULL) {
7133
148k
        *errorHandler = PyCodec_LookupError(errors);
7134
148k
        if (*errorHandler == NULL)
7135
0
            return NULL;
7136
148k
    }
7137
7138
148k
    len = PyUnicode_GET_LENGTH(unicode);
7139
7140
148k
    make_encode_exception(exceptionObject,
7141
148k
                          encoding, unicode, startpos, endpos, reason);
7142
148k
    if (*exceptionObject == NULL)
7143
0
        return NULL;
7144
7145
148k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7146
148k
    if (restuple == NULL)
7147
148k
        return NULL;
7148
0
    if (!PyTuple_Check(restuple)) {
7149
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7150
0
        Py_DECREF(restuple);
7151
0
        return NULL;
7152
0
    }
7153
0
    if (!PyArg_ParseTuple(restuple, argparse,
7154
0
                          &resunicode, newpos)) {
7155
0
        Py_DECREF(restuple);
7156
0
        return NULL;
7157
0
    }
7158
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7159
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7160
0
        Py_DECREF(restuple);
7161
0
        return NULL;
7162
0
    }
    /* Negative positions count from the end of the input string. */
7163
0
    if (*newpos<0)
7164
0
        *newpos = len + *newpos;
7165
0
    if (*newpos<0 || *newpos>len) {
7166
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7167
0
        Py_DECREF(restuple);
7168
0
        return NULL;
7169
0
    }
    /* Take our own reference to the replacement before the tuple that
       holds it is released. */
7170
0
    Py_INCREF(resunicode);
7171
0
    Py_DECREF(restuple);
7172
0
    return resunicode;
7173
0
}
7174
7175
/* Encode `unicode` into a bytes object using one byte per character.

   Only code points below `limit` can be encoded.  A `limit` of 256 selects
   the "latin-1" codec name and error reason; any other value selects
   "ascii" (the caller visible in this file passes 128 for that case).

   Each maximal run of unencodable characters is handled according to
   `errors`: strict, replace, ignore, backslashreplace, xmlcharrefreplace
   and surrogateescape are implemented inline, every other handler is
   called through unicode_encode_call_errorhandler().

   Returns a new bytes object, or NULL with an exception set. */
static PyObject *
7176
unicode_encode_ucs1(PyObject *unicode,
7177
                    const char *errors,
7178
                    const Py_UCS4 limit)
7179
585k
{
7180
    /* input state */
7181
585k
    Py_ssize_t pos=0, size;
7182
585k
    int kind;
7183
585k
    const void *data;
7184
585k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7185
585k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7186
585k
    PyObject *error_handler_obj = NULL;
7187
585k
    PyObject *exc = NULL;
7188
585k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7189
585k
    PyObject *rep = NULL;
7190
7191
585k
    size = PyUnicode_GET_LENGTH(unicode);
7192
585k
    kind = PyUnicode_KIND(unicode);
7193
585k
    data = PyUnicode_DATA(unicode);
7194
    /* allocate enough for a simple encoding without
7195
       replacements, if we need more, we'll resize */
7196
585k
    if (size == 0)
7197
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7198
7199
    /* output object */
7200
585k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7201
585k
    if (writer == NULL) {
7202
0
        return NULL;
7203
0
    }
7204
    /* pointer into the output */
7205
585k
    char *str = PyBytesWriter_GetData(writer);
7206
7207
5.96M
    while (pos < size) {
7208
5.95M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7209
7210
        /* can we encode this? */
7211
5.95M
        if (ch < limit) {
7212
            /* no overflow check, because we know that the space is enough */
7213
5.37M
            *str++ = (char)ch;
7214
5.37M
            ++pos;
7215
5.37M
        }
7216
585k
        else {
7217
585k
            Py_ssize_t newpos, i;
7218
            /* startpos for collecting unencodable chars */
7219
585k
            Py_ssize_t collstart = pos;
7220
585k
            Py_ssize_t collend = collstart + 1;
7221
            /* find all unencodable characters */
7222
7223
2.42M
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7224
1.84M
                ++collend;
7225
7226
            /* Only overallocate the buffer if it's not the last write */
7227
585k
            writer->overallocate = (collend < size);
7228
7229
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7230
585k
            if (error_handler == _Py_ERROR_UNKNOWN)
7231
585k
                error_handler = _Py_GetErrorHandler(errors);
7232
7233
585k
            switch (error_handler) {
7234
570k
            case _Py_ERROR_STRICT:
7235
570k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7236
570k
                goto onError;
7237
                /* replace: one '?' per unencodable character, then share
                   the position update with the ignore case. */
7238
6.02k
            case _Py_ERROR_REPLACE:
7239
6.02k
                memset(str, '?', collend - collstart);
7240
6.02k
                str += (collend - collstart);
7241
6.02k
                _Py_FALLTHROUGH;
7242
6.02k
            case _Py_ERROR_IGNORE:
7243
6.02k
                pos = collend;
7244
6.02k
                break;
7245
7246
0
            case _Py_ERROR_BACKSLASHREPLACE:
7247
                /* subtract preallocated bytes */
7248
0
                writer->size -= (collend - collstart);
7249
0
                str = backslashreplace(writer, str,
7250
0
                                       unicode, collstart, collend);
7251
0
                if (str == NULL)
7252
0
                    goto onError;
7253
0
                pos = collend;
7254
0
                break;
7255
7256
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7257
                /* subtract preallocated bytes */
7258
0
                writer->size -= (collend - collstart);
7259
0
                str = xmlcharrefreplace(writer, str,
7260
0
                                        unicode, collstart, collend);
7261
0
                if (str == NULL)
7262
0
                    goto onError;
7263
0
                pos = collend;
7264
0
                break;
7265
                /* surrogateescape: turn leading U+DC80..U+DCFF characters
                   of the run back into single bytes.  If the run contains
                   anything else, the remainder (from the updated `pos`) is
                   passed to the generic handler in the default case. */
7266
8.93k
            case _Py_ERROR_SURROGATEESCAPE:
7267
8.93k
                for (i = collstart; i < collend; ++i) {
7268
8.93k
                    ch = PyUnicode_READ(kind, data, i);
7269
8.93k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7270
                        /* Not a UTF-8b surrogate */
7271
8.93k
                        break;
7272
8.93k
                    }
7273
0
                    *str++ = (char)(ch - 0xdc00);
7274
0
                    ++pos;
7275
0
                }
7276
8.93k
                if (i >= collend)
7277
0
                    break;
7278
8.93k
                collstart = pos;
7279
8.93k
                assert(collstart != collend);
7280
8.93k
                _Py_FALLTHROUGH;
7281
7282
8.93k
            default:
7283
8.93k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7284
8.93k
                                                       encoding, reason, unicode, &exc,
7285
8.93k
                                                       collstart, collend, &newpos);
7286
8.93k
                if (rep == NULL)
7287
8.93k
                    goto onError;
7288
                /* The handler asked to resume before the failing run:
                   grow the buffer by the distance stepped back
                   (presumably because those characters are encoded again
                   - TODO confirm). */
7289
0
                if (newpos < collstart) {
7290
0
                    writer->overallocate = 1;
7291
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7292
0
                                                             collstart - newpos,
7293
0
                                                             str);
7294
0
                    if (str == NULL) {
7295
0
                        goto onError;
7296
0
                    }
7297
0
                }
7298
0
                else {
7299
                    /* subtract preallocated bytes */
7300
0
                    writer->size -= newpos - collstart;
7301
                    /* Only overallocate the buffer if it's not the last write */
7302
0
                    writer->overallocate = (newpos < size);
7303
0
                }
7304
7305
0
                char *rep_str;
7306
0
                Py_ssize_t rep_len;
7307
0
                if (PyBytes_Check(rep)) {
7308
                    /* Directly copy bytes result to output. */
7309
0
                    rep_str = PyBytes_AS_STRING(rep);
7310
0
                    rep_len = PyBytes_GET_SIZE(rep);
7311
0
                }
7312
0
                else {
7313
0
                    assert(PyUnicode_Check(rep));
7314
                    /* A str replacement must itself be encodable: 1-byte
                       kind for latin-1, pure ASCII otherwise. */
7315
0
                    if (limit == 256 ?
7316
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7317
0
                        !PyUnicode_IS_ASCII(rep))
7318
0
                    {
7319
                        /* Not all characters are smaller than limit */
7320
0
                        raise_encode_exception(&exc, encoding, unicode,
7321
0
                                               collstart, collend, reason);
7322
0
                        goto onError;
7323
0
                    }
7324
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7325
0
                    rep_str = PyUnicode_DATA(rep);
7326
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7327
0
                }
7328
7329
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7330
0
                if (str == NULL) {
7331
0
                    goto onError;
7332
0
                }
7333
0
                memcpy(str, rep_str, rep_len);
7334
0
                str += rep_len;
7335
7336
0
                pos = newpos;
7337
0
                Py_CLEAR(rep);
7338
585k
            }
7339
7340
            /* If overallocation was disabled, ensure that it was the last
7341
               write. Otherwise, we missed an optimization */
7342
585k
            assert(writer->overallocate || pos == size);
7343
6.02k
        }
7344
5.95M
    }
7345
7346
5.79k
    Py_XDECREF(error_handler_obj);
7347
5.79k
    Py_XDECREF(exc);
7348
5.79k
    return PyBytesWriter_FinishWithPointer(writer, str);
7349
7350
579k
  onError:
7351
579k
    Py_XDECREF(rep);
7352
579k
    PyBytesWriter_Discard(writer);
7353
579k
    Py_XDECREF(error_handler_obj);
7354
579k
    Py_XDECREF(exc);
7355
579k
    return NULL;
7356
585k
}
7357
7358
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.
   Returns NULL after PyErr_BadArgument() if `unicode` is not a str.
   Strings of 1-byte kind are copied straight into a bytes object; all
   others go through unicode_encode_ucs1() with a limit of 256. */
PyObject *
7359
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7360
10
{
7361
10
    if (!PyUnicode_Check(unicode)) {
7362
0
        PyErr_BadArgument();
7363
0
        return NULL;
7364
0
    }
7365
    /* Fast path: if it is a one-byte string, construct
7366
       bytes object directly. */
7367
10
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7368
10
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7369
10
                                         PyUnicode_GET_LENGTH(unicode));
7370
    /* Non-Latin-1 characters present. Defer to above function to
7371
       raise the exception. */
7372
0
    return unicode_encode_ucs1(unicode, errors, 256);
7373
10
}
7374
7375
/* Encode `unicode` as Latin-1; forwards to _PyUnicode_AsLatin1String()
   with `errors` set to NULL. */
PyObject*
7376
PyUnicode_AsLatin1String(PyObject *unicode)
7377
0
{
7378
0
    return _PyUnicode_AsLatin1String(unicode, NULL);
7379
0
}
7380
7381
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7382
7383
/* Decode `size` bytes at `s` as 7-bit ASCII and return a str object, or
   NULL with an exception set.

   The input is first decoded optimistically with ascii_decode() into a
   freshly allocated ASCII string; if that consumes everything the string
   is returned as-is.  Otherwise decoding continues byte by byte from the
   first offending position, and every byte >= 128 is handled according to
   `errors`: replace, surrogateescape and ignore are implemented inline,
   all other handlers go through
   unicode_decode_call_errorhandler_writer(). */
PyObject *
7384
PyUnicode_DecodeASCII(const char *s,
7385
                      Py_ssize_t size,
7386
                      const char *errors)
7387
13.9M
{
7388
13.9M
    const char *starts = s;
7389
13.9M
    const char *e = s + size;
7390
13.9M
    PyObject *error_handler_obj = NULL;
7391
13.9M
    PyObject *exc = NULL;
7392
13.9M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7393
7394
13.9M
    if (size == 0)
7395
0
        _Py_RETURN_UNICODE_EMPTY();
7396
7397
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7398
13.9M
    if (size == 1 && (unsigned char)s[0] < 128) {
7399
524k
        return get_latin1_char((unsigned char)s[0]);
7400
524k
    }
7401
7402
    // Shortcut for simple case
7403
13.3M
    PyObject *u = PyUnicode_New(size, 127);
7404
13.3M
    if (u == NULL) {
7405
0
        return NULL;
7406
0
    }
7407
13.3M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7408
13.3M
    if (outpos == size) {
7409
10.8M
        return u;
7410
10.8M
    }
7411
    /* Slow path: continue in a writer that takes over `u`, starting at
       the position where ascii_decode() stopped. */
7412
2.50M
    _PyUnicodeWriter writer;
7413
2.50M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7414
2.50M
    writer.pos = outpos;
7415
7416
2.50M
    s += outpos;
7417
2.50M
    int kind = writer.kind;
7418
2.50M
    void *data = writer.data;
7419
2.50M
    Py_ssize_t startinpos, endinpos;
7420
7421
20.8M
    while (s < e) {
7422
20.6M
        unsigned char c = (unsigned char)*s;
7423
20.6M
        if (c < 128) {
7424
5.74M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7425
5.74M
            writer.pos++;
7426
5.74M
            ++s;
7427
5.74M
            continue;
7428
5.74M
        }
7429
7430
        /* byte outside range 0x00..0x7f: call the error handler */
7431
7432
14.9M
        if (error_handler == _Py_ERROR_UNKNOWN)
7433
2.50M
            error_handler = _Py_GetErrorHandler(errors);
7434
7435
14.9M
        switch (error_handler)
7436
14.9M
        {
7437
1.01M
        case _Py_ERROR_REPLACE:
7438
12.5M
        case _Py_ERROR_SURROGATEESCAPE:
7439
            /* Fast-path: the error handler only writes one character,
7440
               but we may switch to UCS2 at the first write */
7441
12.5M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7442
0
                goto onError;
            /* Refresh the cached kind/data after the writer was prepared
               for a different kind. */
7443
12.5M
            kind = writer.kind;
7444
12.5M
            data = writer.data;
7445
7446
12.5M
            if (error_handler == _Py_ERROR_REPLACE)
7447
1.01M
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7448
11.5M
            else
7449
11.5M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7450
12.5M
            writer.pos++;
7451
12.5M
            ++s;
7452
12.5M
            break;
7453
7454
0
        case _Py_ERROR_IGNORE:
7455
0
            ++s;
7456
0
            break;
7457
7458
2.32M
        default:
7459
2.32M
            startinpos = s-starts;
7460
2.32M
            endinpos = startinpos + 1;
7461
2.32M
            if (unicode_decode_call_errorhandler_writer(
7462
2.32M
                    errors, &error_handler_obj,
7463
2.32M
                    "ascii", "ordinal not in range(128)",
7464
2.32M
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7465
2.32M
                    &writer))
7466
2.32M
                goto onError;
7467
0
            kind = writer.kind;
7468
0
            data = writer.data;
7469
14.9M
        }
7470
14.9M
    }
7471
176k
    Py_XDECREF(error_handler_obj);
7472
176k
    Py_XDECREF(exc);
7473
176k
    return _PyUnicodeWriter_Finish(&writer);
7474
7475
2.32M
  onError:
7476
2.32M
    _PyUnicodeWriter_Dealloc(&writer);
7477
2.32M
    Py_XDECREF(error_handler_obj);
7478
2.32M
    Py_XDECREF(exc);
7479
2.32M
    return NULL;
7480
2.50M
}
7481
7482
/* Encode `unicode` as ASCII using the error handler named by `errors`.
   Returns NULL after PyErr_BadArgument() if `unicode` is not a str.
   ASCII-only strings are copied straight into a bytes object; all others
   go through unicode_encode_ucs1() with a limit of 128. */
PyObject *
7483
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7484
938k
{
7485
938k
    if (!PyUnicode_Check(unicode)) {
7486
0
        PyErr_BadArgument();
7487
0
        return NULL;
7488
0
    }
7489
    /* Fast path: if it is an ASCII-only string, construct bytes object
7490
       directly. Else defer to above function to raise the exception. */
7491
938k
    if (PyUnicode_IS_ASCII(unicode))
7492
353k
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7493
353k
                                         PyUnicode_GET_LENGTH(unicode));
7494
585k
    return unicode_encode_ucs1(unicode, errors, 128);
7495
938k
}
7496
7497
/* Encode `unicode` as ASCII; forwards to _PyUnicode_AsASCIIString() with
   `errors` set to NULL. */
PyObject *
7498
PyUnicode_AsASCIIString(PyObject *unicode)
7499
118k
{
7500
118k
    return _PyUnicode_AsASCIIString(unicode, NULL);
7501
118k
}
7502
7503
#ifdef MS_WINDOWS
7504
7505
/* --- MBCS codecs for Windows -------------------------------------------- */
7506
7507
#if SIZEOF_INT < SIZEOF_SIZE_T
7508
#define NEED_RETRY
7509
#endif
7510
7511
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7512
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7513
   both cases also and avoids partial characters overrunning the
7514
   length limit in MultiByteToWideChar on Windows */
7515
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7516
7517
#ifndef WC_ERR_INVALID_CHARS
7518
#  define WC_ERR_INVALID_CHARS 0x0080
7519
#endif
7520
7521
/* Return the codec name for a Windows code page.

   For CP_ACP the static string "mbcs" is returned and *obj is set to NULL.
   For any other code page a bytes object "cp<number>" is created, stored
   in *obj, and a pointer to its internal buffer is returned; the caller
   owns *obj and has to keep it alive while the name is in use.  Returns
   NULL (with *obj NULL) if the bytes object cannot be created. */
static const char*
7522
code_page_name(UINT code_page, PyObject **obj)
7523
{
7524
    *obj = NULL;
7525
    if (code_page == CP_ACP)
7526
        return "mbcs";
7527
7528
    *obj = PyBytes_FromFormat("cp%u", code_page);
7529
    if (*obj == NULL)
7530
        return NULL;
7531
    return PyBytes_AS_STRING(*obj);
7532
}
7533
7534
/* Return the flags value used when decoding from `code_page`:
   0 for CP_UTF7, MB_ERR_INVALID_CHARS for every other code page. */
static DWORD
7535
decode_code_page_flags(UINT code_page)
7536
{
7537
    if (code_page == CP_UTF7) {
7538
        /* The CP_UTF7 decoder only supports flags=0 */
7539
        return 0;
7540
    }
7541
    else
7542
        return MB_ERR_INVALID_CHARS;
7543
}
7544
7545
/*
7546
 * Decode a byte string from a Windows code page into unicode object in strict
7547
 * mode.
7548
 *
7549
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7550
 * OSError and returns -1 on other error.
7551
 */
7552
/* Parameters: `in`/`insize` is the input chunk (`insize` must be > 0,
   asserted).  *buf/*bufsize describe a growable wchar_t buffer; the
   decoded characters are appended starting at the old *bufsize after the
   buffer has been enlarged with widechar_resize().

   MultiByteToWideChar() is called twice: once with a NULL output buffer
   to learn the required size, then again to do the conversion. */
static int
7553
decode_code_page_strict(UINT code_page,
7554
                        wchar_t **buf,
7555
                        Py_ssize_t *bufsize,
7556
                        const char *in,
7557
                        int insize)
7558
{
7559
    DWORD flags = MB_ERR_INVALID_CHARS;
7560
    wchar_t *out;
7561
    DWORD outsize;
7562
7563
    /* First get the size of the result */
7564
    assert(insize > 0);
    /* Retry at most once: if the first attempt fails with
       ERROR_INVALID_FLAGS, clear the flags and try again; any other
       failure (or a failure with flags already 0) is an error. */
7565
    while ((outsize = MultiByteToWideChar(code_page, flags,
7566
                                          in, insize, NULL, 0)) <= 0)
7567
    {
7568
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7569
            goto error;
7570
        }
7571
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7572
        flags = 0;
7573
    }
7574
7575
    /* Extend a wchar_t* buffer */
7576
    Py_ssize_t n = *bufsize;   /* Get the current length */
7577
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7578
        return -1;
7579
    }
7580
    out = *buf + n;
7581
7582
    /* Do the conversion */
7583
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7584
    if (outsize <= 0)
7585
        goto error;
7586
    return insize;
7587
    /* ERROR_NO_UNICODE_TRANSLATION is reported as -2 without setting an
       exception; everything else becomes an OSError and -1. */
7588
error:
7589
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7590
        return -2;
7591
    PyErr_SetFromWindowsErr(0);
7592
    return -1;
7593
}
7594
7595
/*
7596
 * Decode a byte string from a code page into unicode object with an error
7597
 * handler.
7598
 *
7599
 * Returns consumed size if succeed, or raise an OSError or
7600
 * UnicodeDecodeError exception and returns -1 on error.
7601
 */
7602
/* Parameters: `in`/`size` is the input chunk (`size` must be > 0,
   asserted); *buf/*bufsize describe a growable wchar_t buffer to which
   the output is appended; `errors` names the error handler; `final`
   tells whether this is the last chunk.

   With `errors` NULL or "strict" and `final` set, a UnicodeDecodeError is
   raised immediately.  Otherwise the input is decoded one character at a
   time, trying input lengths 1..4 for each character, and undecodable
   bytes are passed to unicode_decode_call_errorhandler_wchar().  When
   `final` is zero, an undecodable sequence that reaches the end of the
   input is left unconsumed.  On success *bufsize is set to the number of
   wide characters now in *buf. */
static int
7603
decode_code_page_errors(UINT code_page,
7604
                        wchar_t **buf,
7605
                        Py_ssize_t *bufsize,
7606
                        const char *in, const int size,
7607
                        const char *errors, int final)
7608
{
7609
    const char *startin = in;
7610
    const char *endin = in + size;
7611
    DWORD flags = MB_ERR_INVALID_CHARS;
7612
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7613
       2000 English version of the message. */
7614
    const char *reason = "No mapping for the Unicode character exists "
7615
                         "in the target code page.";
7616
    /* each step cannot decode more than 1 character, but a character can be
7617
       represented as a surrogate pair */
7618
    wchar_t buffer[2], *out;
7619
    int insize;
7620
    Py_ssize_t outsize;
7621
    PyObject *errorHandler = NULL;
7622
    PyObject *exc = NULL;
7623
    PyObject *encoding_obj = NULL;
7624
    const char *encoding;
7625
    DWORD err;
7626
    int ret = -1;
7627
7628
    assert(size > 0);
7629
7630
    encoding = code_page_name(code_page, &encoding_obj);
7631
    if (encoding == NULL)
7632
        return -1;
7633
7634
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7635
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7636
           UnicodeDecodeError. */
7637
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7638
        if (exc != NULL) {
7639
            PyCodec_StrictErrors(exc);
7640
            Py_CLEAR(exc);
7641
        }
7642
        goto error;
7643
    }
7644
7645
    /* Extend a wchar_t* buffer */
7646
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Up to Py_ARRAY_LENGTH(buffer) wide characters are reserved per input
       byte; make sure n + size * that count cannot overflow. */
7647
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7648
        PyErr_NoMemory();
7649
        goto error;
7650
    }
7651
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7652
        goto error;
7653
    }
7654
    out = *buf + n;
7655
7656
    /* Decode the byte string character per character */
7657
    while (in < endin)
7658
    {
7659
        /* Decode a character */
7660
        insize = 1;
7661
        do
7662
        {
7663
            outsize = MultiByteToWideChar(code_page, flags,
7664
                                          in, insize,
7665
                                          buffer, Py_ARRAY_LENGTH(buffer));
7666
            if (outsize > 0)
7667
                break;
7668
            err = GetLastError();
7669
            if (err == ERROR_INVALID_FLAGS && flags) {
7670
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7671
                flags = 0;
7672
                continue;
7673
            }
7674
            if (err != ERROR_NO_UNICODE_TRANSLATION
7675
                && err != ERROR_INSUFFICIENT_BUFFER)
7676
            {
7677
                PyErr_SetFromWindowsErr(err);
7678
                goto error;
7679
            }
            /* Could not decode with this many bytes: try one more. */
7680
            insize++;
7681
        }
7682
        /* 4=maximum length of a UTF-8 sequence */
7683
        while (insize <= 4 && (in + insize) <= endin);
7684
7685
        if (outsize <= 0) {
7686
            Py_ssize_t startinpos, endinpos, outpos;
7687
7688
            /* last character in partial decode? */
7689
            if (in + insize >= endin && !final)
7690
                break;
7691
            /* Report a single undecodable byte at the current position to
               the error handler. */
7692
            startinpos = in - startin;
7693
            endinpos = startinpos + 1;
7694
            outpos = out - *buf;
7695
            if (unicode_decode_call_errorhandler_wchar(
7696
                    errors, &errorHandler,
7697
                    encoding, reason,
7698
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7699
                    buf, bufsize, &outpos))
7700
            {
7701
                goto error;
7702
            }
            /* *buf and outpos are passed by address above, so recompute
               the output pointer from their current values. */
7703
            out = *buf + outpos;
7704
        }
7705
        else {
7706
            in += insize;
7707
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7708
            out += outsize;
7709
        }
7710
    }
7711
7712
    /* Shrink the buffer */
7713
    assert(out - *buf <= *bufsize);
7714
    *bufsize = out - *buf;
7715
    /* (in - startin) <= size and size is an int */
7716
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7717
    /* Shared exit: also reached on success, with `ret` holding the number
       of input bytes consumed. */
7718
error:
7719
    Py_XDECREF(encoding_obj);
7720
    Py_XDECREF(errorHandler);
7721
    Py_XDECREF(exc);
7722
    return ret;
7723
}
7724
7725
static PyObject *
7726
decode_code_page_stateful(int code_page,
7727
                          const char *s, Py_ssize_t size,
7728
                          const char *errors, Py_ssize_t *consumed)
7729
{
7730
    wchar_t *buf = NULL;
7731
    Py_ssize_t bufsize = 0;
7732
    int chunk_size, final, converted, done;
7733
7734
    if (code_page < 0) {
7735
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7736
        return NULL;
7737
    }
7738
    if (size < 0) {
7739
        PyErr_BadInternalCall();
7740
        return NULL;
7741
    }
7742
7743
    if (consumed)
7744
        *consumed = 0;
7745
7746
    do
7747
    {
7748
#ifdef NEED_RETRY
7749
        if (size > DECODING_CHUNK_SIZE) {
7750
            chunk_size = DECODING_CHUNK_SIZE;
7751
            final = 0;
7752
            done = 0;
7753
        }
7754
        else
7755
#endif
7756
        {
7757
            chunk_size = (int)size;
7758
            final = (consumed == NULL);
7759
            done = 1;
7760
        }
7761
7762
        if (chunk_size == 0 && done) {
7763
            if (buf != NULL)
7764
                break;
7765
            _Py_RETURN_UNICODE_EMPTY();
7766
        }
7767
7768
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7769
                                            s, chunk_size);
7770
        if (converted == -2)
7771
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7772
                                                s, chunk_size,
7773
                                                errors, final);
7774
        assert(converted != 0 || done);
7775
7776
        if (converted < 0) {
7777
            PyMem_Free(buf);
7778
            return NULL;
7779
        }
7780
7781
        if (consumed)
7782
            *consumed += converted;
7783
7784
        s += converted;
7785
        size -= converted;
7786
    } while (!done);
7787
7788
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7789
    PyMem_Free(buf);
7790
    return v;
7791
}
7792
7793
PyObject *
7794
PyUnicode_DecodeCodePageStateful(int code_page,
7795
                                 const char *s,
7796
                                 Py_ssize_t size,
7797
                                 const char *errors,
7798
                                 Py_ssize_t *consumed)
7799
{
7800
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7801
}
7802
7803
PyObject *
7804
PyUnicode_DecodeMBCSStateful(const char *s,
7805
                             Py_ssize_t size,
7806
                             const char *errors,
7807
                             Py_ssize_t *consumed)
7808
{
7809
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7810
}
7811
7812
PyObject *
7813
PyUnicode_DecodeMBCS(const char *s,
7814
                     Py_ssize_t size,
7815
                     const char *errors)
7816
{
7817
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7818
}
7819
7820
static DWORD
7821
encode_code_page_flags(UINT code_page, const char *errors)
7822
{
7823
    if (code_page == CP_UTF8) {
7824
        return WC_ERR_INVALID_CHARS;
7825
    }
7826
    else if (code_page == CP_UTF7) {
7827
        /* CP_UTF7 only supports flags=0 */
7828
        return 0;
7829
    }
7830
    else {
7831
        if (errors != NULL && strcmp(errors, "replace") == 0)
7832
            return 0;
7833
        else
7834
            return WC_NO_BEST_FIT_CHARS;
7835
    }
7836
}
7837
7838
/*
7839
 * Encode a Unicode string to a Windows code page into a byte string in strict
7840
 * mode.
7841
 *
7842
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7843
 * an OSError and returns -1 on other error.
7844
 */
7845
static int
7846
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7847
                        PyObject *unicode, Py_ssize_t offset, int len,
7848
                        const char* errors)
7849
{
7850
    BOOL usedDefaultChar = FALSE;
7851
    BOOL *pusedDefaultChar = &usedDefaultChar;
7852
    int outsize;
7853
    wchar_t *p;
7854
    Py_ssize_t size;
7855
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7856
    char *out;
7857
    /* Create a substring so that we can get the UTF-16 representation
7858
       of just the slice under consideration. */
7859
    PyObject *substring;
7860
    int ret = -1;
7861
7862
    assert(len > 0);
7863
7864
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7865
        pusedDefaultChar = &usedDefaultChar;
7866
    else
7867
        pusedDefaultChar = NULL;
7868
7869
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7870
    if (substring == NULL)
7871
        return -1;
7872
    p = PyUnicode_AsWideCharString(substring, &size);
7873
    Py_CLEAR(substring);
7874
    if (p == NULL) {
7875
        return -1;
7876
    }
7877
    assert(size <= INT_MAX);
7878
7879
    /* First get the size of the result */
7880
    outsize = WideCharToMultiByte(code_page, flags,
7881
                                  p, (int)size,
7882
                                  NULL, 0,
7883
                                  NULL, pusedDefaultChar);
7884
    if (outsize <= 0)
7885
        goto error;
7886
    /* If we used a default char, then we failed! */
7887
    if (pusedDefaultChar && *pusedDefaultChar) {
7888
        ret = -2;
7889
        goto done;
7890
    }
7891
7892
    if (*writer == NULL) {
7893
        /* Create string object */
7894
        *writer = PyBytesWriter_Create(outsize);
7895
        if (*writer == NULL) {
7896
            goto done;
7897
        }
7898
        out = PyBytesWriter_GetData(*writer);
7899
    }
7900
    else {
7901
        /* Extend string object */
7902
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7903
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7904
            goto done;
7905
        }
7906
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7907
    }
7908
7909
    /* Do the conversion */
7910
    outsize = WideCharToMultiByte(code_page, flags,
7911
                                  p, (int)size,
7912
                                  out, outsize,
7913
                                  NULL, pusedDefaultChar);
7914
    if (outsize <= 0)
7915
        goto error;
7916
    if (pusedDefaultChar && *pusedDefaultChar) {
7917
        ret = -2;
7918
        goto done;
7919
    }
7920
    ret = 0;
7921
7922
done:
7923
    PyMem_Free(p);
7924
    return ret;
7925
7926
error:
7927
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7928
        ret = -2;
7929
        goto done;
7930
    }
7931
    PyErr_SetFromWindowsErr(0);
7932
    goto done;
7933
}
7934
7935
/*
7936
 * Encode a Unicode string to a Windows code page into a byte string using an
7937
 * error handler.
7938
 *
7939
 * Returns consumed characters if succeed, or raise an OSError and returns
7940
 * -1 on other error.
7941
 */
7942
static int
7943
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7944
                        PyObject *unicode, Py_ssize_t unicode_offset,
7945
                        Py_ssize_t insize, const char* errors)
7946
{
7947
    const DWORD flags = encode_code_page_flags(code_page, errors);
7948
    Py_ssize_t pos = unicode_offset;
7949
    Py_ssize_t endin = unicode_offset + insize;
7950
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7951
       2000 English version of the message. */
7952
    const char *reason = "invalid character";
7953
    /* 4=maximum length of a UTF-8 sequence */
7954
    char buffer[4];
7955
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7956
    Py_ssize_t outsize;
7957
    char *out;
7958
    PyObject *errorHandler = NULL;
7959
    PyObject *exc = NULL;
7960
    PyObject *encoding_obj = NULL;
7961
    const char *encoding;
7962
    Py_ssize_t newpos;
7963
    PyObject *rep;
7964
    int ret = -1;
7965
7966
    assert(insize > 0);
7967
7968
    encoding = code_page_name(code_page, &encoding_obj);
7969
    if (encoding == NULL)
7970
        return -1;
7971
7972
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7973
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7974
           then we raise a UnicodeEncodeError. */
7975
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7976
        if (exc != NULL) {
7977
            PyCodec_StrictErrors(exc);
7978
            Py_DECREF(exc);
7979
        }
7980
        Py_XDECREF(encoding_obj);
7981
        return -1;
7982
    }
7983
7984
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7985
        pusedDefaultChar = &usedDefaultChar;
7986
    else
7987
        pusedDefaultChar = NULL;
7988
7989
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7990
        PyErr_NoMemory();
7991
        goto error;
7992
    }
7993
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7994
7995
    if (*writer == NULL) {
7996
        /* Create string object */
7997
        *writer = PyBytesWriter_Create(outsize);
7998
        if (*writer == NULL) {
7999
            goto error;
8000
        }
8001
        out = PyBytesWriter_GetData(*writer);
8002
    }
8003
    else {
8004
        /* Extend string object */
8005
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8006
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8007
            goto error;
8008
        }
8009
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8010
    }
8011
8012
    /* Encode the string character per character */
8013
    while (pos < endin)
8014
    {
8015
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8016
        wchar_t chars[2];
8017
        int charsize;
8018
        if (ch < 0x10000) {
8019
            chars[0] = (wchar_t)ch;
8020
            charsize = 1;
8021
        }
8022
        else {
8023
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8024
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8025
            charsize = 2;
8026
        }
8027
8028
        outsize = WideCharToMultiByte(code_page, flags,
8029
                                      chars, charsize,
8030
                                      buffer, Py_ARRAY_LENGTH(buffer),
8031
                                      NULL, pusedDefaultChar);
8032
        if (outsize > 0) {
8033
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8034
            {
8035
                pos++;
8036
                memcpy(out, buffer, outsize);
8037
                out += outsize;
8038
                continue;
8039
            }
8040
        }
8041
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8042
            PyErr_SetFromWindowsErr(0);
8043
            goto error;
8044
        }
8045
8046
        rep = unicode_encode_call_errorhandler(
8047
                  errors, &errorHandler, encoding, reason,
8048
                  unicode, &exc,
8049
                  pos, pos + 1, &newpos);
8050
        if (rep == NULL)
8051
            goto error;
8052
8053
        Py_ssize_t morebytes = pos - newpos;
8054
        if (PyBytes_Check(rep)) {
8055
            outsize = PyBytes_GET_SIZE(rep);
8056
            morebytes += outsize;
8057
            if (morebytes > 0) {
8058
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8059
                if (out == NULL) {
8060
                    Py_DECREF(rep);
8061
                    goto error;
8062
                }
8063
            }
8064
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8065
            out += outsize;
8066
        }
8067
        else {
8068
            Py_ssize_t i;
8069
            int kind;
8070
            const void *data;
8071
8072
            outsize = PyUnicode_GET_LENGTH(rep);
8073
            morebytes += outsize;
8074
            if (morebytes > 0) {
8075
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8076
                if (out == NULL) {
8077
                    Py_DECREF(rep);
8078
                    goto error;
8079
                }
8080
            }
8081
            kind = PyUnicode_KIND(rep);
8082
            data = PyUnicode_DATA(rep);
8083
            for (i=0; i < outsize; i++) {
8084
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8085
                if (ch > 127) {
8086
                    raise_encode_exception(&exc,
8087
                        encoding, unicode,
8088
                        pos, pos + 1,
8089
                        "unable to encode error handler result to ASCII");
8090
                    Py_DECREF(rep);
8091
                    goto error;
8092
                }
8093
                *out = (unsigned char)ch;
8094
                out++;
8095
            }
8096
        }
8097
        pos = newpos;
8098
        Py_DECREF(rep);
8099
    }
8100
    /* write a NUL byte */
8101
    *out = 0;
8102
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8103
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8104
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8105
        goto error;
8106
    }
8107
    ret = 0;
8108
8109
error:
8110
    Py_XDECREF(encoding_obj);
8111
    Py_XDECREF(errorHandler);
8112
    Py_XDECREF(exc);
8113
    return ret;
8114
}
8115
8116
8117
/*
 * Encode the str object `unicode` to the Windows code page `code_page` and
 * return a bytes object (NULL with an exception set on failure).
 *
 * The string is processed in chunks (at most DECODING_CHUNK_SIZE characters
 * each when NEED_RETRY is defined).  Each chunk is first tried with
 * encode_code_page_strict(); when that reports -2 the chunk is encoded again
 * by encode_code_page_errors() using `errors`.
 */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    PyBytesWriter *writer = NULL;
    Py_ssize_t offset = 0;

    for (;;) {
        int chunk_len, last;
        int status;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
            last = 1;
        }

        status = encode_code_page_strict(code_page, &writer,
                                         unicode, offset, chunk_len,
                                         errors);
        if (status == -2) {
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, offset,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        offset += chunk_len;
        remaining -= chunk_len;

        if (last) {
            break;
        }
    }

    return PyBytesWriter_Finish(writer);
}
8175
8176
8177
/* Encode `unicode` with PyUnicode_EncodeCodePage() using the CP_ACP code
   page and errors=NULL. */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    return encoded;
}
8182
8183
#undef NEED_RETRY
8184
8185
#endif /* MS_WINDOWS */
8186
8187
/* --- Character Mapping Codec -------------------------------------------- */
8188
8189
static int
8190
charmap_decode_string(const char *s,
8191
                      Py_ssize_t size,
8192
                      PyObject *mapping,
8193
                      const char *errors,
8194
                      _PyUnicodeWriter *writer)
8195
711k
{
8196
711k
    const char *starts = s;
8197
711k
    const char *e;
8198
711k
    Py_ssize_t startinpos, endinpos;
8199
711k
    PyObject *errorHandler = NULL, *exc = NULL;
8200
711k
    Py_ssize_t maplen;
8201
711k
    int mapkind;
8202
711k
    const void *mapdata;
8203
711k
    Py_UCS4 x;
8204
711k
    unsigned char ch;
8205
8206
711k
    maplen = PyUnicode_GET_LENGTH(mapping);
8207
711k
    mapdata = PyUnicode_DATA(mapping);
8208
711k
    mapkind = PyUnicode_KIND(mapping);
8209
8210
711k
    e = s + size;
8211
8212
711k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8213
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8214
         * is disabled in encoding aliases, latin1 is preferred because
8215
         * its implementation is faster. */
8216
126
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8217
126
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8218
126
        Py_UCS4 maxchar = writer->maxchar;
8219
8220
126
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8221
1.05M
        while (s < e) {
8222
1.05M
            ch = *s;
8223
1.05M
            x = mapdata_ucs1[ch];
8224
1.05M
            if (x > maxchar) {
8225
116
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8226
0
                    goto onError;
8227
116
                maxchar = writer->maxchar;
8228
116
                outdata = (Py_UCS1 *)writer->data;
8229
116
            }
8230
1.05M
            outdata[writer->pos] = x;
8231
1.05M
            writer->pos++;
8232
1.05M
            ++s;
8233
1.05M
        }
8234
126
        return 0;
8235
126
    }
8236
8237
829k
    while (s < e) {
8238
813k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8239
813k
            int outkind = writer->kind;
8240
813k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8241
813k
            if (outkind == PyUnicode_1BYTE_KIND) {
8242
749k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8243
749k
                Py_UCS4 maxchar = writer->maxchar;
8244
22.1M
                while (s < e) {
8245
21.4M
                    ch = *s;
8246
21.4M
                    x = mapdata_ucs2[ch];
8247
21.4M
                    if (x > maxchar)
8248
80.2k
                        goto Error;
8249
21.4M
                    outdata[writer->pos] = x;
8250
21.4M
                    writer->pos++;
8251
21.4M
                    ++s;
8252
21.4M
                }
8253
668k
                break;
8254
749k
            }
8255
63.8k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8256
63.8k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8257
32.4M
                while (s < e) {
8258
32.4M
                    ch = *s;
8259
32.4M
                    x = mapdata_ucs2[ch];
8260
32.4M
                    if (x == 0xFFFE)
8261
38.7k
                        goto Error;
8262
32.3M
                    outdata[writer->pos] = x;
8263
32.3M
                    writer->pos++;
8264
32.3M
                    ++s;
8265
32.3M
                }
8266
25.1k
                break;
8267
63.8k
            }
8268
813k
        }
8269
0
        ch = *s;
8270
8271
0
        if (ch < maplen)
8272
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8273
0
        else
8274
0
            x = 0xfffe; /* invalid value */
8275
119k
Error:
8276
119k
        if (x == 0xfffe)
8277
61.9k
        {
8278
            /* undefined mapping */
8279
61.9k
            startinpos = s-starts;
8280
61.9k
            endinpos = startinpos+1;
8281
61.9k
            if (unicode_decode_call_errorhandler_writer(
8282
61.9k
                    errors, &errorHandler,
8283
61.9k
                    "charmap", "character maps to <undefined>",
8284
61.9k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8285
61.9k
                    writer)) {
8286
17
                goto onError;
8287
17
            }
8288
61.8k
            continue;
8289
61.9k
        }
8290
8291
57.0k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8292
0
            goto onError;
8293
57.0k
        ++s;
8294
57.0k
    }
8295
710k
    Py_XDECREF(errorHandler);
8296
710k
    Py_XDECREF(exc);
8297
710k
    return 0;
8298
8299
17
onError:
8300
17
    Py_XDECREF(errorHandler);
8301
17
    Py_XDECREF(exc);
8302
17
    return -1;
8303
711k
}
8304
8305
static int
8306
charmap_decode_mapping(const char *s,
8307
                       Py_ssize_t size,
8308
                       PyObject *mapping,
8309
                       const char *errors,
8310
                       _PyUnicodeWriter *writer)
8311
0
{
8312
0
    const char *starts = s;
8313
0
    const char *e;
8314
0
    Py_ssize_t startinpos, endinpos;
8315
0
    PyObject *errorHandler = NULL, *exc = NULL;
8316
0
    unsigned char ch;
8317
0
    PyObject *key, *item = NULL;
8318
8319
0
    e = s + size;
8320
8321
0
    while (s < e) {
8322
0
        ch = *s;
8323
8324
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8325
0
        key = PyLong_FromLong((long)ch);
8326
0
        if (key == NULL)
8327
0
            goto onError;
8328
8329
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8330
0
        Py_DECREF(key);
8331
0
        if (rc == 0) {
8332
            /* No mapping found means: mapping is undefined. */
8333
0
            goto Undefined;
8334
0
        }
8335
0
        if (item == NULL) {
8336
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8337
                /* No mapping found means: mapping is undefined. */
8338
0
                PyErr_Clear();
8339
0
                goto Undefined;
8340
0
            } else
8341
0
                goto onError;
8342
0
        }
8343
8344
        /* Apply mapping */
8345
0
        if (item == Py_None)
8346
0
            goto Undefined;
8347
0
        if (PyLong_Check(item)) {
8348
0
            long value = PyLong_AsLong(item);
8349
0
            if (value == 0xFFFE)
8350
0
                goto Undefined;
8351
0
            if (value < 0 || value > MAX_UNICODE) {
8352
0
                PyErr_Format(PyExc_TypeError,
8353
0
                             "character mapping must be in range(0x%lx)",
8354
0
                             (unsigned long)MAX_UNICODE + 1);
8355
0
                goto onError;
8356
0
            }
8357
8358
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8359
0
                goto onError;
8360
0
        }
8361
0
        else if (PyUnicode_Check(item)) {
8362
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8363
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8364
0
                if (value == 0xFFFE)
8365
0
                    goto Undefined;
8366
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                    goto onError;
8368
0
            }
8369
0
            else {
8370
0
                writer->overallocate = 1;
8371
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8372
0
                    goto onError;
8373
0
            }
8374
0
        }
8375
0
        else {
8376
            /* wrong return value */
8377
0
            PyErr_SetString(PyExc_TypeError,
8378
0
                            "character mapping must return integer, None or str");
8379
0
            goto onError;
8380
0
        }
8381
0
        Py_CLEAR(item);
8382
0
        ++s;
8383
0
        continue;
8384
8385
0
Undefined:
8386
        /* undefined mapping */
8387
0
        Py_CLEAR(item);
8388
0
        startinpos = s-starts;
8389
0
        endinpos = startinpos+1;
8390
0
        if (unicode_decode_call_errorhandler_writer(
8391
0
                errors, &errorHandler,
8392
0
                "charmap", "character maps to <undefined>",
8393
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8394
0
                writer)) {
8395
0
            goto onError;
8396
0
        }
8397
0
    }
8398
0
    Py_XDECREF(errorHandler);
8399
0
    Py_XDECREF(exc);
8400
0
    return 0;
8401
8402
0
onError:
8403
0
    Py_XDECREF(item);
8404
0
    Py_XDECREF(errorHandler);
8405
0
    Py_XDECREF(exc);
8406
0
    return -1;
8407
0
}
8408
8409
PyObject *
8410
PyUnicode_DecodeCharmap(const char *s,
8411
                        Py_ssize_t size,
8412
                        PyObject *mapping,
8413
                        const char *errors)
8414
711k
{
8415
711k
    _PyUnicodeWriter writer;
8416
8417
    /* Default to Latin-1 */
8418
711k
    if (mapping == NULL)
8419
21
        return PyUnicode_DecodeLatin1(s, size, errors);
8420
8421
711k
    if (size == 0)
8422
0
        _Py_RETURN_UNICODE_EMPTY();
8423
711k
    _PyUnicodeWriter_Init(&writer);
8424
711k
    writer.min_length = size;
8425
711k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8426
0
        goto onError;
8427
8428
711k
    if (PyUnicode_CheckExact(mapping)) {
8429
711k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8430
17
            goto onError;
8431
711k
    }
8432
0
    else {
8433
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8434
0
            goto onError;
8435
0
    }
8436
711k
    return _PyUnicodeWriter_Finish(&writer);
8437
8438
17
  onError:
8439
17
    _PyUnicodeWriter_Dealloc(&writer);
8440
17
    return NULL;
8441
711k
}
8442
8443
/* Charmap encoding: the lookup table */
8444
8445
/*[clinic input]
8446
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8447
[clinic start generated code]*/
8448
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8449
8450
struct encoding_map {
8451
    PyObject_HEAD
8452
    unsigned char level1[32];
8453
    int count2, count3;
8454
    unsigned char level23[1];
8455
};
8456
8457
/*[clinic input]
8458
EncodingMap.size
8459
8460
Return the size (in bytes) of this object.
8461
[clinic start generated code]*/
8462
8463
static PyObject *
8464
EncodingMap_size_impl(struct encoding_map *self)
8465
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8466
0
{
8467
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8468
0
                           128*self->count3);
8469
0
}
8470
8471
static PyMethodDef encoding_map_methods[] = {
8472
    ENCODINGMAP_SIZE_METHODDEF
8473
    {NULL, NULL}
8474
};
8475
8476
static PyTypeObject EncodingMapType = {
8477
    PyVarObject_HEAD_INIT(NULL, 0)
8478
    .tp_name = "EncodingMap",
8479
    .tp_basicsize = sizeof(struct encoding_map),
8480
    /* methods */
8481
    .tp_flags = Py_TPFLAGS_DEFAULT,
8482
    .tp_methods = encoding_map_methods,
8483
};
8484
8485
/*
 * Build an encoding map from a decoding table: `string` maps byte value i
 * (its index) to the character string[i]; at most the first 256 characters
 * are used and 0xFFFE marks an unmapped byte.
 *
 * Normally returns an EncodingMap object (a three-level trie from character
 * to byte value).  A plain dict {character: byte value} is returned instead
 * when string[0] is not NUL, when a later entry is NUL or above 0xFFFF, or
 * when the trie would need 0xFF or more blocks at level 2 or level 3.
 *
 * Returns NULL with an exception set on error.
 */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];
    unsigned char level2[512];
    int count2 = 0, count3 = 0;
    int need_dict = 0;
    int i;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    const int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    const int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }
    /* First pass: count the level-2 and level-3 blocks that are needed. */
    for (i = 1; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        const int l1 = ch >> 11;
        const int l2 = ch >> 7;
        if (level1[l1] == 0xFF) {
            level1[l1] = count2++;
        }
        if (level2[l2] == 0xFF) {
            level2[l2] = count3++;
        }
    }

    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            const Py_UCS4 c = PyUnicode_READ(kind, data, i);
            PyObject *key = PyLong_FromLong(c);
            if (key == NULL) {
                Py_DECREF(dict);
                return NULL;
            }
            PyObject *value = PyLong_FromLong(i);
            if (value == NULL) {
                Py_DECREF(key);
                Py_DECREF(dict);
                return NULL;
            }
            const int rc = PyDict_SetItem(dict, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }

    _PyObject_Init(result, &EncodingMapType);
    struct encoding_map *mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;

    unsigned char *mlevel1 = mresult->level1;
    unsigned char *mlevel2 = mresult->level23;
    unsigned char *mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: assign the level-3 blocks in order of first use and
       store the byte value i in the slot of its character. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        const int o1 = ch>>11;
        const int o2 = (ch>>7) & 0xF;
        const int i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF) {
            mlevel2[i2] = count3++;
        }
        const int o3 = ch & 0x7F;
        const int i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8600
8601
static int
8602
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8603
0
{
8604
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8605
0
    int l1 = c>>11;
8606
0
    int l2 = (c>>7) & 0xF;
8607
0
    int l3 = c & 0x7F;
8608
0
    int i;
8609
8610
0
    if (c > 0xFFFF)
8611
0
        return -1;
8612
0
    if (c == 0)
8613
0
        return 0;
8614
    /* level 1*/
8615
0
    i = map->level1[l1];
8616
0
    if (i == 0xFF) {
8617
0
        return -1;
8618
0
    }
8619
    /* level 2*/
8620
0
    i = map->level23[16*i+l2];
8621
0
    if (i == 0xFF) {
8622
0
        return -1;
8623
0
    }
8624
    /* level 3 */
8625
0
    i = map->level23[16*map->count2 + 128*i + l3];
8626
0
    if (i == 0) {
8627
0
        return -1;
8628
0
    }
8629
0
    return i;
8630
0
}
8631
8632
/* Lookup the character in the mapping.
8633
   On success, return PyLong, PyBytes or None (if the character can't be found).
8634
   If the result is PyLong, put its value in replace.
8635
   On error, return NULL.
8636
   */
8637
static PyObject *
8638
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8639
0
{
8640
0
    PyObject *w = PyLong_FromLong((long)c);
8641
0
    PyObject *x;
8642
8643
0
    if (w == NULL)
8644
0
        return NULL;
8645
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8646
0
    Py_DECREF(w);
8647
0
    if (rc == 0) {
8648
        /* No mapping found means: mapping is undefined. */
8649
0
        Py_RETURN_NONE;
8650
0
    }
8651
0
    if (x == NULL) {
8652
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8653
            /* No mapping found means: mapping is undefined. */
8654
0
            PyErr_Clear();
8655
0
            Py_RETURN_NONE;
8656
0
        } else
8657
0
            return NULL;
8658
0
    }
8659
0
    else if (x == Py_None)
8660
0
        return x;
8661
0
    else if (PyLong_Check(x)) {
8662
0
        long value = PyLong_AsLong(x);
8663
0
        if (value < 0 || value > 255) {
8664
0
            PyErr_SetString(PyExc_TypeError,
8665
0
                            "character mapping must be in range(256)");
8666
0
            Py_DECREF(x);
8667
0
            return NULL;
8668
0
        }
8669
0
        *replace = (unsigned char)value;
8670
0
        return x;
8671
0
    }
8672
0
    else if (PyBytes_Check(x))
8673
0
        return x;
8674
0
    else {
8675
        /* wrong return value */
8676
0
        PyErr_Format(PyExc_TypeError,
8677
0
                     "character mapping must return integer, bytes or None, not %.400s",
8678
0
                     Py_TYPE(x)->tp_name);
8679
0
        Py_DECREF(x);
8680
0
        return NULL;
8681
0
    }
8682
0
}
8683
8684
static int
8685
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8686
0
{
8687
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8688
    /* exponentially overallocate to minimize reallocations */
8689
0
    if (requiredsize < 2 * outsize)
8690
0
        requiredsize = 2 * outsize;
8691
0
    return PyBytesWriter_Resize(writer, requiredsize);
8692
0
}
8693
8694
/* Outcome of trying to encode one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8697
/* lookup the character, put the result in the output string and adjust
8698
   various state variables. Resize the output bytes object if not enough
8699
   space is available. Return a new reference to the object that
8700
   was put in the output buffer, or Py_None, if the mapping was undefined
8701
   (in which case no character was written) or NULL, if a
8702
   reallocation error occurred. The caller must decref the result */
8703
static charmapencode_result
8704
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8705
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8706
0
{
8707
0
    PyObject *rep;
8708
0
    unsigned char replace;
8709
0
    char *outstart;
8710
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8711
8712
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8713
0
        int res = encoding_map_lookup(c, mapping);
8714
0
        Py_ssize_t requiredsize = *outpos+1;
8715
0
        if (res == -1) {
8716
0
            return enc_FAILED;
8717
0
        }
8718
8719
0
        if (outsize<requiredsize) {
8720
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8721
0
                return enc_EXCEPTION;
8722
0
            }
8723
0
        }
8724
0
        outstart = _PyBytesWriter_GetData(writer);
8725
0
        outstart[(*outpos)++] = (char)res;
8726
0
        return enc_SUCCESS;
8727
0
    }
8728
8729
0
    rep = charmapencode_lookup(c, mapping, &replace);
8730
0
    if (rep==NULL)
8731
0
        return enc_EXCEPTION;
8732
0
    else if (rep==Py_None) {
8733
0
        Py_DECREF(rep);
8734
0
        return enc_FAILED;
8735
0
    } else {
8736
0
        if (PyLong_Check(rep)) {
8737
0
            Py_ssize_t requiredsize = *outpos+1;
8738
0
            if (outsize<requiredsize)
8739
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8740
0
                    Py_DECREF(rep);
8741
0
                    return enc_EXCEPTION;
8742
0
                }
8743
0
            outstart = _PyBytesWriter_GetData(writer);
8744
0
            outstart[(*outpos)++] = (char)replace;
8745
0
        }
8746
0
        else {
8747
0
            const char *repchars = PyBytes_AS_STRING(rep);
8748
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8749
0
            Py_ssize_t requiredsize = *outpos+repsize;
8750
0
            if (outsize<requiredsize)
8751
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8752
0
                    Py_DECREF(rep);
8753
0
                    return enc_EXCEPTION;
8754
0
                }
8755
0
            outstart = _PyBytesWriter_GetData(writer);
8756
0
            memcpy(outstart + *outpos, repchars, repsize);
8757
0
            *outpos += repsize;
8758
0
        }
8759
0
    }
8760
0
    Py_DECREF(rep);
8761
0
    return enc_SUCCESS;
8762
0
}
8763
8764
/* handle an error in _PyUnicode_EncodeCharmap()
8765
   Return 0 on success, -1 on error */
8766
static int
8767
charmap_encoding_error(
8768
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8769
    PyObject **exceptionObject,
8770
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8771
    PyBytesWriter *writer, Py_ssize_t *respos)
8772
0
{
8773
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8774
0
    Py_ssize_t size, repsize;
8775
0
    Py_ssize_t newpos;
8776
0
    int kind;
8777
0
    const void *data;
8778
0
    Py_ssize_t index;
8779
    /* startpos for collecting unencodable chars */
8780
0
    Py_ssize_t collstartpos = *inpos;
8781
0
    Py_ssize_t collendpos = *inpos+1;
8782
0
    Py_ssize_t collpos;
8783
0
    const char *encoding = "charmap";
8784
0
    const char *reason = "character maps to <undefined>";
8785
0
    charmapencode_result x;
8786
0
    Py_UCS4 ch;
8787
0
    int val;
8788
8789
0
    size = PyUnicode_GET_LENGTH(unicode);
8790
    /* find all unencodable characters */
8791
0
    while (collendpos < size) {
8792
0
        PyObject *rep;
8793
0
        unsigned char replace;
8794
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8795
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8796
0
            val = encoding_map_lookup(ch, mapping);
8797
0
            if (val != -1)
8798
0
                break;
8799
0
            ++collendpos;
8800
0
            continue;
8801
0
        }
8802
8803
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8805
0
        if (rep==NULL)
8806
0
            return -1;
8807
0
        else if (rep!=Py_None) {
8808
0
            Py_DECREF(rep);
8809
0
            break;
8810
0
        }
8811
0
        Py_DECREF(rep);
8812
0
        ++collendpos;
8813
0
    }
8814
    /* cache callback name lookup
8815
     * (if not done yet, i.e. it's the first error) */
8816
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8817
0
        *error_handler = _Py_GetErrorHandler(errors);
8818
8819
0
    switch (*error_handler) {
8820
0
    case _Py_ERROR_STRICT:
8821
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8822
0
        return -1;
8823
8824
0
    case _Py_ERROR_REPLACE:
8825
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8826
0
            x = charmapencode_output('?', mapping, writer, respos);
8827
0
            if (x==enc_EXCEPTION) {
8828
0
                return -1;
8829
0
            }
8830
0
            else if (x==enc_FAILED) {
8831
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8832
0
                return -1;
8833
0
            }
8834
0
        }
8835
0
        _Py_FALLTHROUGH;
8836
0
    case _Py_ERROR_IGNORE:
8837
0
        *inpos = collendpos;
8838
0
        break;
8839
8840
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8841
        /* generate replacement (temporarily (mis)uses p) */
8842
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8843
0
            char buffer[2+29+1+1];
8844
0
            char *cp;
8845
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8846
0
            for (cp = buffer; *cp; ++cp) {
8847
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8848
0
                if (x==enc_EXCEPTION)
8849
0
                    return -1;
8850
0
                else if (x==enc_FAILED) {
8851
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8852
0
                    return -1;
8853
0
                }
8854
0
            }
8855
0
        }
8856
0
        *inpos = collendpos;
8857
0
        break;
8858
8859
0
    default:
8860
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8861
0
                                                      encoding, reason, unicode, exceptionObject,
8862
0
                                                      collstartpos, collendpos, &newpos);
8863
0
        if (repunicode == NULL)
8864
0
            return -1;
8865
0
        if (PyBytes_Check(repunicode)) {
8866
            /* Directly copy bytes result to output. */
8867
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8868
0
            Py_ssize_t requiredsize;
8869
0
            repsize = PyBytes_Size(repunicode);
8870
0
            requiredsize = *respos + repsize;
8871
0
            if (requiredsize > outsize)
8872
                /* Make room for all additional bytes. */
8873
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8874
0
                    Py_DECREF(repunicode);
8875
0
                    return -1;
8876
0
                }
8877
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8878
0
                   PyBytes_AsString(repunicode),  repsize);
8879
0
            *respos += repsize;
8880
0
            *inpos = newpos;
8881
0
            Py_DECREF(repunicode);
8882
0
            break;
8883
0
        }
8884
        /* generate replacement  */
8885
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8886
0
        data = PyUnicode_DATA(repunicode);
8887
0
        kind = PyUnicode_KIND(repunicode);
8888
0
        for (index = 0; index < repsize; index++) {
8889
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8890
0
            x = charmapencode_output(repch, mapping, writer, respos);
8891
0
            if (x==enc_EXCEPTION) {
8892
0
                Py_DECREF(repunicode);
8893
0
                return -1;
8894
0
            }
8895
0
            else if (x==enc_FAILED) {
8896
0
                Py_DECREF(repunicode);
8897
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8898
0
                return -1;
8899
0
            }
8900
0
        }
8901
0
        *inpos = newpos;
8902
0
        Py_DECREF(repunicode);
8903
0
    }
8904
0
    return 0;
8905
0
}
8906
8907
/* Encode unicode to bytes using the character mapping.

   mapping may be NULL (encode via unicode_encode_ucs1() with limit 256), an
   EncodingMap object (handled by a dedicated loop), or any other object
   accepted by charmapencode_output().  Unencodable characters are passed to
   charmap_encoding_error() together with errors.

   Return a new bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    const Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    /* error handling state, filled in lazily on the first error */
    PyObject *exc = NULL;
    PyObject *error_handler_obj = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* current input and output positions */
    Py_ssize_t inpos = 0;
    Py_ssize_t respos = 0;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        /* Cached view of the writer; refreshed whenever it may have been
           resized. */
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            int res = encoding_map_lookup(ch, mapping);

            if (res == -1) {
                /* unencodable character: the handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);

            switch (x) {
            case enc_EXCEPTION:
                goto onError;
            case enc_FAILED:
                /* unencodable character: the handler advances inpos */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                break;
            default:
                /* done with this character => adjust input position */
                ++inpos;
                break;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9015
9016
/* Encode a str object with the given character mapping, passing NULL as the
   errors argument of _PyUnicode_EncodeCharmap().

   Fails via PyErr_BadArgument() and returns NULL if unicode is not a str
   object or mapping is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
9026
9027
/* create or adjust a UnicodeTranslateError */
9028
static void
9029
make_translate_exception(PyObject **exceptionObject,
9030
                         PyObject *unicode,
9031
                         Py_ssize_t startpos, Py_ssize_t endpos,
9032
                         const char *reason)
9033
0
{
9034
0
    if (*exceptionObject == NULL) {
9035
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9036
0
            unicode, startpos, endpos, reason);
9037
0
    }
9038
0
    else {
9039
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9040
0
            goto onError;
9041
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9042
0
            goto onError;
9043
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9044
0
            goto onError;
9045
0
        return;
9046
0
      onError:
9047
0
        Py_CLEAR(*exceptionObject);
9048
0
    }
9049
0
}
9050
9051
/* error handling callback helper:
9052
   build arguments, call the callback and check the arguments,
9053
   put the result into newpos and return the replacement string, which
9054
   has to be freed by the caller */
9055
static PyObject *
9056
unicode_translate_call_errorhandler(const char *errors,
9057
                                    PyObject **errorHandler,
9058
                                    const char *reason,
9059
                                    PyObject *unicode, PyObject **exceptionObject,
9060
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9061
                                    Py_ssize_t *newpos)
9062
0
{
9063
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9064
9065
0
    Py_ssize_t i_newpos;
9066
0
    PyObject *restuple;
9067
0
    PyObject *resunicode;
9068
9069
0
    if (*errorHandler == NULL) {
9070
0
        *errorHandler = PyCodec_LookupError(errors);
9071
0
        if (*errorHandler == NULL)
9072
0
            return NULL;
9073
0
    }
9074
9075
0
    make_translate_exception(exceptionObject,
9076
0
                             unicode, startpos, endpos, reason);
9077
0
    if (*exceptionObject == NULL)
9078
0
        return NULL;
9079
9080
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9081
0
    if (restuple == NULL)
9082
0
        return NULL;
9083
0
    if (!PyTuple_Check(restuple)) {
9084
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9085
0
        Py_DECREF(restuple);
9086
0
        return NULL;
9087
0
    }
9088
0
    if (!PyArg_ParseTuple(restuple, argparse,
9089
0
                          &resunicode, &i_newpos)) {
9090
0
        Py_DECREF(restuple);
9091
0
        return NULL;
9092
0
    }
9093
0
    if (i_newpos<0)
9094
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9095
0
    else
9096
0
        *newpos = i_newpos;
9097
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9098
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9099
0
        Py_DECREF(restuple);
9100
0
        return NULL;
9101
0
    }
9102
0
    Py_INCREF(resunicode);
9103
0
    Py_DECREF(restuple);
9104
0
    return resunicode;
9105
0
}
9106
9107
/* Lookup the character ch in the mapping and put the result in result,
9108
   which must be decrefed by the caller.
9109
   The result can be PyLong, PyUnicode, None or NULL.
9110
   If the result is PyLong, put its value in replace.
9111
   Return 0 on success, -1 on error */
9112
static int
9113
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9114
18.8k
{
9115
18.8k
    PyObject *w = PyLong_FromLong((long)c);
9116
18.8k
    PyObject *x;
9117
9118
18.8k
    if (w == NULL)
9119
0
        return -1;
9120
18.8k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9121
18.8k
    Py_DECREF(w);
9122
18.8k
    if (rc == 0) {
9123
        /* No mapping found means: use 1:1 mapping. */
9124
6.35k
        *result = NULL;
9125
6.35k
        return 0;
9126
6.35k
    }
9127
12.4k
    if (x == NULL) {
9128
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9129
            /* No mapping found means: use 1:1 mapping. */
9130
0
            PyErr_Clear();
9131
0
            *result = NULL;
9132
0
            return 0;
9133
0
        } else
9134
0
            return -1;
9135
0
    }
9136
12.4k
    else if (x == Py_None) {
9137
0
        *result = x;
9138
0
        return 0;
9139
0
    }
9140
12.4k
    else if (PyLong_Check(x)) {
9141
0
        long value = PyLong_AsLong(x);
9142
0
        if (value < 0 || value > MAX_UNICODE) {
9143
0
            PyErr_Format(PyExc_ValueError,
9144
0
                         "character mapping must be in range(0x%lx)",
9145
0
                         (unsigned long)MAX_UNICODE + 1);
9146
0
            Py_DECREF(x);
9147
0
            return -1;
9148
0
        }
9149
0
        *result = x;
9150
0
        *replace = (Py_UCS4)value;
9151
0
        return 0;
9152
0
    }
9153
12.4k
    else if (PyUnicode_Check(x)) {
9154
12.4k
        *result = x;
9155
12.4k
        return 0;
9156
12.4k
    }
9157
0
    else {
9158
        /* wrong return value */
9159
0
        PyErr_SetString(PyExc_TypeError,
9160
0
                        "character mapping must return integer, None or str");
9161
0
        Py_DECREF(x);
9162
0
        return -1;
9163
0
    }
9164
12.4k
}
9165
9166
/* lookup the character, write the result into the writer.
9167
   Return 1 if the result was written into the writer, return 0 if the mapping
9168
   was undefined, raise an exception return -1 on error. */
9169
static int
9170
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9171
                        _PyUnicodeWriter *writer)
9172
6.40k
{
9173
6.40k
    PyObject *item;
9174
6.40k
    Py_UCS4 replace;
9175
9176
6.40k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9177
0
        return -1;
9178
9179
6.40k
    if (item == NULL) {
9180
        /* not found => default to 1:1 mapping */
9181
113
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9182
0
            return -1;
9183
0
        }
9184
113
        return 1;
9185
113
    }
9186
9187
6.29k
    if (item == Py_None) {
9188
0
        Py_DECREF(item);
9189
0
        return 0;
9190
0
    }
9191
9192
6.29k
    if (PyLong_Check(item)) {
9193
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9194
0
            Py_DECREF(item);
9195
0
            return -1;
9196
0
        }
9197
0
        Py_DECREF(item);
9198
0
        return 1;
9199
0
    }
9200
9201
6.29k
    if (!PyUnicode_Check(item)) {
9202
0
        Py_DECREF(item);
9203
0
        return -1;
9204
0
    }
9205
9206
6.29k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9207
0
        Py_DECREF(item);
9208
0
        return -1;
9209
0
    }
9210
9211
6.29k
    Py_DECREF(item);
9212
6.29k
    return 1;
9213
6.29k
}
9214
9215
static int
9216
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9217
                              Py_UCS1 *translate)
9218
12.4k
{
9219
12.4k
    PyObject *item = NULL;
9220
12.4k
    Py_UCS4 replace;
9221
12.4k
    int ret = 0;
9222
9223
12.4k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9224
0
        return -1;
9225
0
    }
9226
9227
12.4k
    if (item == Py_None) {
9228
        /* deletion */
9229
0
        translate[ch] = 0xfe;
9230
0
    }
9231
12.4k
    else if (item == NULL) {
9232
        /* not found => default to 1:1 mapping */
9233
6.24k
        translate[ch] = ch;
9234
6.24k
        return 1;
9235
6.24k
    }
9236
6.18k
    else if (PyLong_Check(item)) {
9237
0
        if (replace > 127) {
9238
            /* invalid character or character outside ASCII:
9239
               skip the fast translate */
9240
0
            goto exit;
9241
0
        }
9242
0
        translate[ch] = (Py_UCS1)replace;
9243
0
    }
9244
6.18k
    else if (PyUnicode_Check(item)) {
9245
6.18k
        if (PyUnicode_GET_LENGTH(item) != 1)
9246
6.18k
            goto exit;
9247
9248
0
        replace = PyUnicode_READ_CHAR(item, 0);
9249
0
        if (replace > 127)
9250
0
            goto exit;
9251
0
        translate[ch] = (Py_UCS1)replace;
9252
0
    }
9253
0
    else {
9254
        /* not None, NULL, long or unicode */
9255
0
        goto exit;
9256
0
    }
9257
0
    ret = 1;
9258
9259
6.18k
  exit:
9260
6.18k
    Py_DECREF(item);
9261
6.18k
    return ret;
9262
0
}
9263
9264
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9265
   was translated into writer, return 0 if the input string was partially
9266
   translated into writer, raise an exception and return -1 on error. */
9267
static int
9268
unicode_fast_translate(PyObject *input, PyObject *mapping,
9269
                       _PyUnicodeWriter *writer, int ignore,
9270
                       Py_ssize_t *input_pos)
9271
12.3k
{
9272
12.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9273
12.3k
    Py_ssize_t len;
9274
12.3k
    const Py_UCS1 *in, *end;
9275
12.3k
    Py_UCS1 *out;
9276
12.3k
    int res = 0;
9277
9278
12.3k
    len = PyUnicode_GET_LENGTH(input);
9279
9280
12.3k
    memset(ascii_table, 0xff, 128);
9281
9282
12.3k
    in = PyUnicode_1BYTE_DATA(input);
9283
12.3k
    end = in + len;
9284
9285
12.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9286
12.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9287
12.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9288
9289
18.6k
    for (; in < end; in++) {
9290
12.4k
        ch = *in;
9291
12.4k
        ch2 = ascii_table[ch];
9292
12.4k
        if (ch2 == 0xff) {
9293
12.4k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9294
12.4k
                                                          ascii_table);
9295
12.4k
            if (translate < 0)
9296
0
                return -1;
9297
12.4k
            if (translate == 0)
9298
6.18k
                goto exit;
9299
6.24k
            ch2 = ascii_table[ch];
9300
6.24k
        }
9301
6.28k
        if (ch2 == 0xfe) {
9302
0
            if (ignore)
9303
0
                continue;
9304
0
            goto exit;
9305
0
        }
9306
6.28k
        assert(ch2 < 128);
9307
6.28k
        *out = ch2;
9308
6.28k
        out++;
9309
6.28k
    }
9310
6.17k
    res = 1;
9311
9312
12.3k
exit:
9313
12.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9314
12.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9315
12.3k
    return res;
9316
6.17k
}
9317
9318
/* Translate input through mapping and return the resulting string.

   ASCII input is first run through unicode_fast_translate(); whatever it
   leaves unprocessed (or the whole string for non-ASCII input) is handled
   character by character with charmaptranslate_output().  A run of
   characters that the mapping maps to None is skipped when errors is
   "ignore"; otherwise it is passed to the error handler named by errors and
   the returned replacement string is written instead.

   Return a new reference, or NULL with an exception set. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    _PyUnicodeWriter writer;
    Py_ssize_t i;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    const void *data = PyUnicode_DATA(input);
    const int kind = PyUnicode_KIND(input);
    const Py_ssize_t size = PyUnicode_GET_LENGTH(input);

    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    const int ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (!PyUnicode_IS_ASCII(input)) {
        i = 0;
    }
    else {
        int res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (res == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* res == 0: continue below from position i */
    }

    while (i < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0) {
            goto onError;
        }
        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character: find the whole run [collstart, collend)
           of characters that the mapping maps to None */
        Py_ssize_t collstart = i;
        Py_ssize_t collend = i + 1;
        for (; collend < size; ++collend) {
            PyObject *x;
            Py_UCS4 replace;

            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x, &replace)) {
                goto onError;
            }
            /* only the identity of x is needed from here on */
            Py_XDECREF(x);
            if (x != Py_None) {
                break;
            }
        }

        if (ignore) {
            i = collend;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *repunicode = unicode_translate_call_errorhandler(
            errors, &errorHandler,
            reason, input, &exc,
            collstart, collend, &newpos);
        if (repunicode == NULL) {
            goto onError;
        }
        int failed = (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0);
        Py_DECREF(repunicode);
        if (failed) {
            goto onError;
        }
        i = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9434
9435
/* Translate str through mapping using the given error handling.

   Returns NULL if ensure_unicode() rejects str; otherwise returns the
   result of _PyUnicode_TranslateCharmap(). */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    return (ensure_unicode(str) < 0)
        ? NULL
        : _PyUnicode_TranslateCharmap(str, mapping, errors);
}
9444
9445
/* Return an ASCII string derived from unicode.

   An ASCII string is returned unchanged (as a new reference).  Otherwise a
   new string is built in which code points below 127 are copied, other
   code points for which Py_UNICODE_ISSPACE() is true become ' ', and other
   code points with a decimal value become the matching ASCII digit.  At the
   first code point that fits none of these, a '?' is written and the result
   is truncated right after it.

   Return NULL with an exception set if unicode is not a str object or the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        /* NOTE(review): the bound is 127, not 128, so U+007F takes the
           non-ASCII path below - confirm this is intended. */
        if (ch < 127) {
            out[i] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
            continue;
        }

        int decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            out[i] = '0' + decimal;
            continue;
        }

        /* Neither space nor digit: mark it and cut the result short. */
        out[i] = '?';
        out[i+1] = '\0';
        _PyUnicode_LENGTH(result) = i + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9490
9491
/* --- Helpers ------------------------------------------------------------ */
9492
9493
/* Helper macro to fix up start/end slice values in place.

   start and end must be modifiable lvalues; len is the sequence length.
   An end greater than len is set to len.  A negative start or end has len
   added to it once and is set to 0 if it is still negative.  A start larger
   than len is left unchanged.

   Every argument is parenthesized so that an expression argument (for
   example a conditional expression passed as len) is compared and assigned
   as a whole.  The arguments are evaluated more than once, so they must not
   have side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9512
9513
static Py_ssize_t
9514
any_find_slice(PyObject* s1, PyObject* s2,
9515
               Py_ssize_t start,
9516
               Py_ssize_t end,
9517
               int direction)
9518
25.7M
{
9519
25.7M
    int kind1, kind2;
9520
25.7M
    const void *buf1, *buf2;
9521
25.7M
    Py_ssize_t len1, len2, result;
9522
9523
25.7M
    kind1 = PyUnicode_KIND(s1);
9524
25.7M
    kind2 = PyUnicode_KIND(s2);
9525
25.7M
    if (kind1 < kind2)
9526
0
        return -1;
9527
9528
25.7M
    len1 = PyUnicode_GET_LENGTH(s1);
9529
25.7M
    len2 = PyUnicode_GET_LENGTH(s2);
9530
25.7M
    ADJUST_INDICES(start, end, len1);
9531
25.7M
    if (end - start < len2)
9532
1.43M
        return -1;
9533
9534
24.3M
    buf1 = PyUnicode_DATA(s1);
9535
24.3M
    buf2 = PyUnicode_DATA(s2);
9536
24.3M
    if (len2 == 1) {
9537
23.3M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9538
23.3M
        result = findchar((const char *)buf1 + kind1*start,
9539
23.3M
                          kind1, end - start, ch, direction);
9540
23.3M
        if (result == -1)
9541
3.96M
            return -1;
9542
19.3M
        else
9543
19.3M
            return start + result;
9544
23.3M
    }
9545
9546
1.03M
    if (kind2 != kind1) {
9547
333k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9548
333k
        if (!buf2)
9549
0
            return -2;
9550
333k
    }
9551
9552
1.03M
    if (direction > 0) {
9553
1.03M
        switch (kind1) {
9554
704k
        case PyUnicode_1BYTE_KIND:
9555
704k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9556
407k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9557
296k
            else
9558
296k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9559
704k
            break;
9560
257k
        case PyUnicode_2BYTE_KIND:
9561
257k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9562
257k
            break;
9563
75.3k
        case PyUnicode_4BYTE_KIND:
9564
75.3k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9565
75.3k
            break;
9566
0
        default:
9567
0
            Py_UNREACHABLE();
9568
1.03M
        }
9569
1.03M
    }
9570
0
    else {
9571
0
        switch (kind1) {
9572
0
        case PyUnicode_1BYTE_KIND:
9573
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9574
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9575
0
            else
9576
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9577
0
            break;
9578
0
        case PyUnicode_2BYTE_KIND:
9579
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9580
0
            break;
9581
0
        case PyUnicode_4BYTE_KIND:
9582
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            break;
9584
0
        default:
9585
0
            Py_UNREACHABLE();
9586
0
        }
9587
0
    }
9588
9589
1.03M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9590
1.03M
    if (kind2 != kind1)
9591
333k
        PyMem_Free((void *)buf2);
9592
9593
1.03M
    return result;
9594
1.03M
}
9595
9596
9597
Py_ssize_t
9598
PyUnicode_Count(PyObject *str,
9599
                PyObject *substr,
9600
                Py_ssize_t start,
9601
                Py_ssize_t end)
9602
0
{
9603
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9604
0
        return -1;
9605
9606
0
    return unicode_count_impl(str, substr, start, end);
9607
0
}
9608
9609
Py_ssize_t
9610
PyUnicode_Find(PyObject *str,
9611
               PyObject *substr,
9612
               Py_ssize_t start,
9613
               Py_ssize_t end,
9614
               int direction)
9615
0
{
9616
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9617
0
        return -2;
9618
9619
0
    return any_find_slice(str, substr, start, end, direction);
9620
0
}
9621
9622
Py_ssize_t
9623
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9624
                   Py_ssize_t start, Py_ssize_t end,
9625
                   int direction)
9626
3.88M
{
9627
3.88M
    int kind;
9628
3.88M
    Py_ssize_t len, result;
9629
3.88M
    len = PyUnicode_GET_LENGTH(str);
9630
3.88M
    ADJUST_INDICES(start, end, len);
9631
3.88M
    if (end - start < 1)
9632
0
        return -1;
9633
3.88M
    kind = PyUnicode_KIND(str);
9634
3.88M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9635
3.88M
                      kind, end-start, ch, direction);
9636
3.88M
    if (result == -1)
9637
2.83M
        return -1;
9638
1.04M
    else
9639
1.04M
        return start + result;
9640
3.88M
}
9641
9642
static int
9643
tailmatch(PyObject *self,
9644
          PyObject *substring,
9645
          Py_ssize_t start,
9646
          Py_ssize_t end,
9647
          int direction)
9648
57.5M
{
9649
57.5M
    int kind_self;
9650
57.5M
    int kind_sub;
9651
57.5M
    const void *data_self;
9652
57.5M
    const void *data_sub;
9653
57.5M
    Py_ssize_t offset;
9654
57.5M
    Py_ssize_t i;
9655
57.5M
    Py_ssize_t end_sub;
9656
9657
57.5M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9658
57.5M
    end -= PyUnicode_GET_LENGTH(substring);
9659
57.5M
    if (end < start)
9660
8.13M
        return 0;
9661
9662
49.3M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9663
0
        return 1;
9664
9665
49.3M
    kind_self = PyUnicode_KIND(self);
9666
49.3M
    data_self = PyUnicode_DATA(self);
9667
49.3M
    kind_sub = PyUnicode_KIND(substring);
9668
49.3M
    data_sub = PyUnicode_DATA(substring);
9669
49.3M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9670
9671
49.3M
    if (direction > 0)
9672
7.22M
        offset = end;
9673
42.1M
    else
9674
42.1M
        offset = start;
9675
9676
49.3M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9677
49.3M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9678
34.5M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9679
34.5M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9680
        /* If both are of the same kind, memcmp is sufficient */
9681
14.6M
        if (kind_self == kind_sub) {
9682
6.92M
            return ! memcmp((char *)data_self +
9683
6.92M
                                (offset * PyUnicode_KIND(substring)),
9684
6.92M
                            data_sub,
9685
6.92M
                            PyUnicode_GET_LENGTH(substring) *
9686
6.92M
                                PyUnicode_KIND(substring));
9687
6.92M
        }
9688
        /* otherwise we have to compare each character by first accessing it */
9689
7.68M
        else {
9690
            /* We do not need to compare 0 and len(substring)-1 because
9691
               the if statement above ensured already that they are equal
9692
               when we end up here. */
9693
7.81M
            for (i = 1; i < end_sub; ++i) {
9694
146k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9695
146k
                    PyUnicode_READ(kind_sub, data_sub, i))
9696
24.6k
                    return 0;
9697
146k
            }
9698
7.66M
            return 1;
9699
7.68M
        }
9700
14.6M
    }
9701
9702
34.7M
    return 0;
9703
49.3M
}
9704
9705
Py_ssize_t
9706
PyUnicode_Tailmatch(PyObject *str,
9707
                    PyObject *substr,
9708
                    Py_ssize_t start,
9709
                    Py_ssize_t end,
9710
                    int direction)
9711
291
{
9712
291
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9713
0
        return -1;
9714
9715
291
    return tailmatch(str, substr, start, end, direction);
9716
291
}
9717
9718
static PyObject *
9719
ascii_upper_or_lower(PyObject *self, int lower)
9720
71.0M
{
9721
71.0M
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9722
71.0M
    const char *data = PyUnicode_DATA(self);
9723
71.0M
    char *resdata;
9724
71.0M
    PyObject *res;
9725
9726
71.0M
    res = PyUnicode_New(len, 127);
9727
71.0M
    if (res == NULL)
9728
0
        return NULL;
9729
71.0M
    resdata = PyUnicode_DATA(res);
9730
71.0M
    if (lower)
9731
71.0M
        _Py_bytes_lower(resdata, data, len);
9732
306
    else
9733
306
        _Py_bytes_upper(resdata, data, len);
9734
71.0M
    return res;
9735
71.0M
}
9736
9737
static Py_UCS4
9738
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9739
541k
{
9740
541k
    Py_ssize_t j;
9741
541k
    int final_sigma;
9742
541k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9743
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9744
9745
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9746
9747
    where ! is a negation and \p{xxx} is a character with property xxx.
9748
    */
9749
1.00M
    for (j = i - 1; j >= 0; j--) {
9750
998k
        c = PyUnicode_READ(kind, data, j);
9751
998k
        if (!_PyUnicode_IsCaseIgnorable(c))
9752
536k
            break;
9753
998k
    }
9754
541k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9755
541k
    if (final_sigma) {
9756
841k
        for (j = i + 1; j < length; j++) {
9757
839k
            c = PyUnicode_READ(kind, data, j);
9758
839k
            if (!_PyUnicode_IsCaseIgnorable(c))
9759
420k
                break;
9760
839k
        }
9761
422k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9762
422k
    }
9763
541k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9764
541k
}
9765
9766
static int
9767
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9768
           Py_UCS4 c, Py_UCS4 *mapped)
9769
123M
{
9770
    /* Obscure special case. */
9771
123M
    if (c == 0x3A3) {
9772
541k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9773
541k
        return 1;
9774
541k
    }
9775
122M
    return _PyUnicode_ToLowerFull(c, mapped);
9776
123M
}
9777
9778
static Py_ssize_t
9779
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9780
0
{
9781
0
    Py_ssize_t i, k = 0;
9782
0
    int n_res, j;
9783
0
    Py_UCS4 c, mapped[3];
9784
9785
0
    c = PyUnicode_READ(kind, data, 0);
9786
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9787
0
    for (j = 0; j < n_res; j++) {
9788
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9789
0
        res[k++] = mapped[j];
9790
0
    }
9791
0
    for (i = 1; i < length; i++) {
9792
0
        c = PyUnicode_READ(kind, data, i);
9793
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9794
0
        for (j = 0; j < n_res; j++) {
9795
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9796
0
            res[k++] = mapped[j];
9797
0
        }
9798
0
    }
9799
0
    return k;
9800
0
}
9801
9802
static Py_ssize_t
9803
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9804
0
    Py_ssize_t i, k = 0;
9805
9806
0
    for (i = 0; i < length; i++) {
9807
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9808
0
        int n_res, j;
9809
0
        if (Py_UNICODE_ISUPPER(c)) {
9810
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9811
0
        }
9812
0
        else if (Py_UNICODE_ISLOWER(c)) {
9813
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9814
0
        }
9815
0
        else {
9816
0
            n_res = 1;
9817
0
            mapped[0] = c;
9818
0
        }
9819
0
        for (j = 0; j < n_res; j++) {
9820
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9821
0
            res[k++] = mapped[j];
9822
0
        }
9823
0
    }
9824
0
    return k;
9825
0
}
9826
9827
static Py_ssize_t
9828
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9829
                  Py_UCS4 *maxchar, int lower)
9830
5.62M
{
9831
5.62M
    Py_ssize_t i, k = 0;
9832
9833
129M
    for (i = 0; i < length; i++) {
9834
123M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9835
123M
        int n_res, j;
9836
123M
        if (lower)
9837
123M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9838
0
        else
9839
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9840
247M
        for (j = 0; j < n_res; j++) {
9841
123M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9842
123M
            res[k++] = mapped[j];
9843
123M
        }
9844
123M
    }
9845
5.62M
    return k;
9846
5.62M
}
9847
9848
static Py_ssize_t
9849
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9850
0
{
9851
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9852
0
}
9853
9854
static Py_ssize_t
9855
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9856
5.62M
{
9857
5.62M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9858
5.62M
}
9859
9860
static Py_ssize_t
9861
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9862
0
{
9863
0
    Py_ssize_t i, k = 0;
9864
9865
0
    for (i = 0; i < length; i++) {
9866
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9867
0
        Py_UCS4 mapped[3];
9868
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9869
0
        for (j = 0; j < n_res; j++) {
9870
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9871
0
            res[k++] = mapped[j];
9872
0
        }
9873
0
    }
9874
0
    return k;
9875
0
}
9876
9877
static Py_ssize_t
9878
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9879
0
{
9880
0
    Py_ssize_t i, k = 0;
9881
0
    int previous_is_cased;
9882
9883
0
    previous_is_cased = 0;
9884
0
    for (i = 0; i < length; i++) {
9885
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9886
0
        Py_UCS4 mapped[3];
9887
0
        int n_res, j;
9888
9889
0
        if (previous_is_cased)
9890
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9891
0
        else
9892
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9893
9894
0
        for (j = 0; j < n_res; j++) {
9895
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9896
0
            res[k++] = mapped[j];
9897
0
        }
9898
9899
0
        previous_is_cased = _PyUnicode_IsCased(c);
9900
0
    }
9901
0
    return k;
9902
0
}
9903
9904
static PyObject *
9905
case_operation(PyObject *self,
9906
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9907
5.62M
{
9908
5.62M
    PyObject *res = NULL;
9909
5.62M
    Py_ssize_t length, newlength = 0;
9910
5.62M
    int kind, outkind;
9911
5.62M
    const void *data;
9912
5.62M
    void *outdata;
9913
5.62M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9914
9915
5.62M
    kind = PyUnicode_KIND(self);
9916
5.62M
    data = PyUnicode_DATA(self);
9917
5.62M
    length = PyUnicode_GET_LENGTH(self);
9918
5.62M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9919
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9920
0
        return NULL;
9921
0
    }
9922
5.62M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9923
5.62M
    if (tmp == NULL)
9924
0
        return PyErr_NoMemory();
9925
5.62M
    newlength = perform(kind, data, length, tmp, &maxchar);
9926
5.62M
    res = PyUnicode_New(newlength, maxchar);
9927
5.62M
    if (res == NULL)
9928
0
        goto leave;
9929
5.62M
    tmpend = tmp + newlength;
9930
5.62M
    outdata = PyUnicode_DATA(res);
9931
5.62M
    outkind = PyUnicode_KIND(res);
9932
5.62M
    switch (outkind) {
9933
204k
    case PyUnicode_1BYTE_KIND:
9934
204k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9935
204k
        break;
9936
5.25M
    case PyUnicode_2BYTE_KIND:
9937
5.25M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9938
5.25M
        break;
9939
167k
    case PyUnicode_4BYTE_KIND:
9940
167k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9941
167k
        break;
9942
0
    default:
9943
0
        Py_UNREACHABLE();
9944
5.62M
    }
9945
5.62M
  leave:
9946
5.62M
    PyMem_Free(tmp);
9947
5.62M
    return res;
9948
5.62M
}
9949
9950
PyObject *
9951
PyUnicode_Join(PyObject *separator, PyObject *seq)
9952
28.2M
{
9953
28.2M
    PyObject *res;
9954
28.2M
    PyObject *fseq;
9955
28.2M
    Py_ssize_t seqlen;
9956
28.2M
    PyObject **items;
9957
9958
28.2M
    fseq = PySequence_Fast(seq, "can only join an iterable");
9959
28.2M
    if (fseq == NULL) {
9960
621
        return NULL;
9961
621
    }
9962
9963
28.2M
    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);
9964
9965
28.2M
    items = PySequence_Fast_ITEMS(fseq);
9966
28.2M
    seqlen = PySequence_Fast_GET_SIZE(fseq);
9967
28.2M
    res = _PyUnicode_JoinArray(separator, items, seqlen);
9968
9969
28.2M
    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();
9970
9971
28.2M
    Py_DECREF(fseq);
9972
28.2M
    return res;
9973
28.2M
}
9974
9975
PyObject *
9976
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9977
45.3M
{
9978
45.3M
    PyObject *res = NULL; /* the result */
9979
45.3M
    PyObject *sep = NULL;
9980
45.3M
    Py_ssize_t seplen;
9981
45.3M
    PyObject *item;
9982
45.3M
    Py_ssize_t sz, i, res_offset;
9983
45.3M
    Py_UCS4 maxchar;
9984
45.3M
    Py_UCS4 item_maxchar;
9985
45.3M
    int use_memcpy;
9986
45.3M
    unsigned char *res_data = NULL, *sep_data = NULL;
9987
45.3M
    PyObject *last_obj;
9988
45.3M
    int kind = 0;
9989
9990
    /* If empty sequence, return u"". */
9991
45.3M
    if (seqlen == 0) {
9992
7.61M
        _Py_RETURN_UNICODE_EMPTY();
9993
7.61M
    }
9994
9995
    /* If singleton sequence with an exact Unicode, return that. */
9996
37.7M
    last_obj = NULL;
9997
37.7M
    if (seqlen == 1) {
9998
14.2M
        if (PyUnicode_CheckExact(items[0])) {
9999
12.9M
            res = items[0];
10000
12.9M
            return Py_NewRef(res);
10001
12.9M
        }
10002
1.29M
        seplen = 0;
10003
1.29M
        maxchar = 0;
10004
1.29M
    }
10005
23.5M
    else {
10006
        /* Set up sep and seplen */
10007
23.5M
        if (separator == NULL) {
10008
            /* fall back to a blank space separator */
10009
0
            sep = PyUnicode_FromOrdinal(' ');
10010
0
            if (!sep)
10011
0
                goto onError;
10012
0
            seplen = 1;
10013
0
            maxchar = 32;
10014
0
        }
10015
23.5M
        else {
10016
23.5M
            if (!PyUnicode_Check(separator)) {
10017
0
                PyErr_Format(PyExc_TypeError,
10018
0
                             "separator: expected str instance,"
10019
0
                             " %.80s found",
10020
0
                             Py_TYPE(separator)->tp_name);
10021
0
                goto onError;
10022
0
            }
10023
23.5M
            sep = separator;
10024
23.5M
            seplen = PyUnicode_GET_LENGTH(separator);
10025
23.5M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10026
            /* inc refcount to keep this code path symmetric with the
10027
               above case of a blank separator */
10028
23.5M
            Py_INCREF(sep);
10029
23.5M
        }
10030
23.5M
        last_obj = sep;
10031
23.5M
    }
10032
10033
    /* There are at least two things to join, or else we have a subclass
10034
     * of str in the sequence.
10035
     * Do a pre-pass to figure out the total amount of space we'll
10036
     * need (sz), and see whether all argument are strings.
10037
     */
10038
24.8M
    sz = 0;
10039
#ifdef Py_DEBUG
10040
    use_memcpy = 0;
10041
#else
10042
24.8M
    use_memcpy = 1;
10043
24.8M
#endif
10044
216M
    for (i = 0; i < seqlen; i++) {
10045
191M
        size_t add_sz;
10046
191M
        item = items[i];
10047
191M
        if (!PyUnicode_Check(item)) {
10048
0
            PyErr_Format(PyExc_TypeError,
10049
0
                         "sequence item %zd: expected str instance,"
10050
0
                         " %.80s found",
10051
0
                         i, Py_TYPE(item)->tp_name);
10052
0
            goto onError;
10053
0
        }
10054
191M
        add_sz = PyUnicode_GET_LENGTH(item);
10055
191M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10056
191M
        maxchar = Py_MAX(maxchar, item_maxchar);
10057
191M
        if (i != 0) {
10058
166M
            add_sz += seplen;
10059
166M
        }
10060
191M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10061
0
            PyErr_SetString(PyExc_OverflowError,
10062
0
                            "join() result is too long for a Python string");
10063
0
            goto onError;
10064
0
        }
10065
191M
        sz += add_sz;
10066
191M
        if (use_memcpy && last_obj != NULL) {
10067
123M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10068
3.56M
                use_memcpy = 0;
10069
123M
        }
10070
191M
        last_obj = item;
10071
191M
    }
10072
10073
24.8M
    res = PyUnicode_New(sz, maxchar);
10074
24.8M
    if (res == NULL)
10075
0
        goto onError;
10076
10077
    /* Catenate everything. */
10078
#ifdef Py_DEBUG
10079
    use_memcpy = 0;
10080
#else
10081
24.8M
    if (use_memcpy) {
10082
21.2M
        res_data = PyUnicode_1BYTE_DATA(res);
10083
21.2M
        kind = PyUnicode_KIND(res);
10084
21.2M
        if (seplen != 0)
10085
197k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10086
21.2M
    }
10087
24.8M
#endif
10088
24.8M
    if (use_memcpy) {
10089
127M
        for (i = 0; i < seqlen; ++i) {
10090
106M
            Py_ssize_t itemlen;
10091
106M
            item = items[i];
10092
10093
            /* Copy item, and maybe the separator. */
10094
106M
            if (i && seplen != 0) {
10095
776k
                memcpy(res_data,
10096
776k
                          sep_data,
10097
776k
                          kind * seplen);
10098
776k
                res_data += kind * seplen;
10099
776k
            }
10100
10101
106M
            itemlen = PyUnicode_GET_LENGTH(item);
10102
106M
            if (itemlen != 0) {
10103
95.4M
                memcpy(res_data,
10104
95.4M
                          PyUnicode_DATA(item),
10105
95.4M
                          kind * itemlen);
10106
95.4M
                res_data += kind * itemlen;
10107
95.4M
            }
10108
106M
        }
10109
21.2M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10110
21.2M
                           + kind * PyUnicode_GET_LENGTH(res));
10111
21.2M
    }
10112
3.56M
    else {
10113
88.6M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10114
85.0M
            Py_ssize_t itemlen;
10115
85.0M
            item = items[i];
10116
10117
            /* Copy item, and maybe the separator. */
10118
85.0M
            if (i && seplen != 0) {
10119
2.10M
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10120
2.10M
                res_offset += seplen;
10121
2.10M
            }
10122
10123
85.0M
            itemlen = PyUnicode_GET_LENGTH(item);
10124
85.0M
            if (itemlen != 0) {
10125
83.6M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10126
83.6M
                res_offset += itemlen;
10127
83.6M
            }
10128
85.0M
        }
10129
3.56M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10130
3.56M
    }
10131
10132
24.8M
    Py_XDECREF(sep);
10133
24.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
10134
24.8M
    return res;
10135
10136
0
  onError:
10137
0
    Py_XDECREF(sep);
10138
0
    Py_XDECREF(res);
10139
0
    return NULL;
10140
24.8M
}
10141
10142
void
10143
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10144
                    Py_UCS4 fill_char)
10145
17.6k
{
10146
17.6k
    const int kind = PyUnicode_KIND(unicode);
10147
17.6k
    void *data = PyUnicode_DATA(unicode);
10148
17.6k
    assert(_PyUnicode_IsModifiable(unicode));
10149
17.6k
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10150
17.6k
    assert(start >= 0);
10151
17.6k
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10152
17.6k
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10153
17.6k
}
10154
10155
Py_ssize_t
10156
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10157
               Py_UCS4 fill_char)
10158
650
{
10159
650
    Py_ssize_t maxlen;
10160
10161
650
    if (!PyUnicode_Check(unicode)) {
10162
0
        PyErr_BadInternalCall();
10163
0
        return -1;
10164
0
    }
10165
650
    if (unicode_check_modifiable(unicode))
10166
0
        return -1;
10167
10168
650
    if (start < 0) {
10169
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10170
0
        return -1;
10171
0
    }
10172
650
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10173
0
        PyErr_SetString(PyExc_ValueError,
10174
0
                         "fill character is bigger than "
10175
0
                         "the string maximum character");
10176
0
        return -1;
10177
0
    }
10178
10179
650
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10180
650
    length = Py_MIN(maxlen, length);
10181
650
    if (length <= 0)
10182
0
        return 0;
10183
10184
650
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10185
650
    return length;
10186
650
}
10187
10188
static PyObject *
10189
pad(PyObject *self,
10190
    Py_ssize_t left,
10191
    Py_ssize_t right,
10192
    Py_UCS4 fill)
10193
68
{
10194
68
    PyObject *u;
10195
68
    Py_UCS4 maxchar;
10196
68
    int kind;
10197
68
    void *data;
10198
10199
68
    if (left < 0)
10200
0
        left = 0;
10201
68
    if (right < 0)
10202
0
        right = 0;
10203
10204
68
    if (left == 0 && right == 0)
10205
0
        return unicode_result_unchanged(self);
10206
10207
68
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10208
68
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10209
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10210
0
        return NULL;
10211
0
    }
10212
68
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10213
68
    maxchar = Py_MAX(maxchar, fill);
10214
68
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10215
68
    if (!u)
10216
0
        return NULL;
10217
10218
68
    kind = PyUnicode_KIND(u);
10219
68
    data = PyUnicode_DATA(u);
10220
68
    if (left)
10221
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10222
68
    if (right)
10223
68
        _PyUnicode_Fill(kind, data, fill,
10224
68
                        left + _PyUnicode_LENGTH(self), right);
10225
68
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10226
68
    assert(_PyUnicode_CheckConsistency(u, 1));
10227
68
    return u;
10228
68
}
10229
10230
PyObject *
10231
PyUnicode_Splitlines(PyObject *string, int keepends)
10232
17.3k
{
10233
17.3k
    PyObject *list;
10234
10235
17.3k
    if (ensure_unicode(string) < 0)
10236
0
        return NULL;
10237
10238
17.3k
    switch (PyUnicode_KIND(string)) {
10239
4.88k
    case PyUnicode_1BYTE_KIND:
10240
4.88k
        if (PyUnicode_IS_ASCII(string))
10241
3.89k
            list = asciilib_splitlines(
10242
3.89k
                string, PyUnicode_1BYTE_DATA(string),
10243
3.89k
                PyUnicode_GET_LENGTH(string), keepends);
10244
985
        else
10245
985
            list = ucs1lib_splitlines(
10246
985
                string, PyUnicode_1BYTE_DATA(string),
10247
985
                PyUnicode_GET_LENGTH(string), keepends);
10248
4.88k
        break;
10249
8.96k
    case PyUnicode_2BYTE_KIND:
10250
8.96k
        list = ucs2lib_splitlines(
10251
8.96k
            string, PyUnicode_2BYTE_DATA(string),
10252
8.96k
            PyUnicode_GET_LENGTH(string), keepends);
10253
8.96k
        break;
10254
3.52k
    case PyUnicode_4BYTE_KIND:
10255
3.52k
        list = ucs4lib_splitlines(
10256
3.52k
            string, PyUnicode_4BYTE_DATA(string),
10257
3.52k
            PyUnicode_GET_LENGTH(string), keepends);
10258
3.52k
        break;
10259
0
    default:
10260
0
        Py_UNREACHABLE();
10261
17.3k
    }
10262
17.3k
    return list;
10263
17.3k
}
10264
10265
static PyObject *
10266
split(PyObject *self,
10267
      PyObject *substring,
10268
      Py_ssize_t maxcount)
10269
22.0M
{
10270
22.0M
    int kind1, kind2;
10271
22.0M
    const void *buf1, *buf2;
10272
22.0M
    Py_ssize_t len1, len2;
10273
22.0M
    PyObject* out;
10274
22.0M
    len1 = PyUnicode_GET_LENGTH(self);
10275
22.0M
    kind1 = PyUnicode_KIND(self);
10276
10277
22.0M
    if (substring == NULL) {
10278
155k
        if (maxcount < 0) {
10279
129k
            maxcount = (len1 - 1) / 2 + 1;
10280
129k
        }
10281
155k
        switch (kind1) {
10282
99.8k
        case PyUnicode_1BYTE_KIND:
10283
99.8k
            if (PyUnicode_IS_ASCII(self))
10284
73.1k
                return asciilib_split_whitespace(
10285
73.1k
                    self,  PyUnicode_1BYTE_DATA(self),
10286
73.1k
                    len1, maxcount
10287
73.1k
                    );
10288
26.7k
            else
10289
26.7k
                return ucs1lib_split_whitespace(
10290
26.7k
                    self,  PyUnicode_1BYTE_DATA(self),
10291
26.7k
                    len1, maxcount
10292
26.7k
                    );
10293
44.9k
        case PyUnicode_2BYTE_KIND:
10294
44.9k
            return ucs2lib_split_whitespace(
10295
44.9k
                self,  PyUnicode_2BYTE_DATA(self),
10296
44.9k
                len1, maxcount
10297
44.9k
                );
10298
10.4k
        case PyUnicode_4BYTE_KIND:
10299
10.4k
            return ucs4lib_split_whitespace(
10300
10.4k
                self,  PyUnicode_4BYTE_DATA(self),
10301
10.4k
                len1, maxcount
10302
10.4k
                );
10303
0
        default:
10304
0
            Py_UNREACHABLE();
10305
155k
        }
10306
155k
    }
10307
10308
21.8M
    kind2 = PyUnicode_KIND(substring);
10309
21.8M
    len2 = PyUnicode_GET_LENGTH(substring);
10310
21.8M
    if (maxcount < 0) {
10311
        // if len2 == 0, it will raise ValueError.
10312
16.1M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10313
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10314
16.1M
        maxcount = maxcount < 0 ? len1 : maxcount;
10315
16.1M
    }
10316
21.8M
    if (kind1 < kind2 || len1 < len2) {
10317
1.11M
        out = PyList_New(1);
10318
1.11M
        if (out == NULL)
10319
0
            return NULL;
10320
1.11M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10321
1.11M
        return out;
10322
1.11M
    }
10323
20.7M
    buf1 = PyUnicode_DATA(self);
10324
20.7M
    buf2 = PyUnicode_DATA(substring);
10325
20.7M
    if (kind2 != kind1) {
10326
262k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10327
262k
        if (!buf2)
10328
0
            return NULL;
10329
262k
    }
10330
10331
20.7M
    switch (kind1) {
10332
20.4M
    case PyUnicode_1BYTE_KIND:
10333
20.4M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10334
19.2M
            out = asciilib_split(
10335
19.2M
                self,  buf1, len1, buf2, len2, maxcount);
10336
1.27M
        else
10337
1.27M
            out = ucs1lib_split(
10338
1.27M
                self,  buf1, len1, buf2, len2, maxcount);
10339
20.4M
        break;
10340
230k
    case PyUnicode_2BYTE_KIND:
10341
230k
        out = ucs2lib_split(
10342
230k
            self,  buf1, len1, buf2, len2, maxcount);
10343
230k
        break;
10344
32.6k
    case PyUnicode_4BYTE_KIND:
10345
32.6k
        out = ucs4lib_split(
10346
32.6k
            self,  buf1, len1, buf2, len2, maxcount);
10347
32.6k
        break;
10348
0
    default:
10349
0
        out = NULL;
10350
20.7M
    }
10351
20.7M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10352
20.7M
    if (kind2 != kind1)
10353
262k
        PyMem_Free((void *)buf2);
10354
20.7M
    return out;
10355
20.7M
}
10356
10357
static PyObject *
10358
rsplit(PyObject *self,
10359
       PyObject *substring,
10360
       Py_ssize_t maxcount)
10361
66
{
10362
66
    int kind1, kind2;
10363
66
    const void *buf1, *buf2;
10364
66
    Py_ssize_t len1, len2;
10365
66
    PyObject* out;
10366
10367
66
    len1 = PyUnicode_GET_LENGTH(self);
10368
66
    kind1 = PyUnicode_KIND(self);
10369
10370
66
    if (substring == NULL) {
10371
0
        if (maxcount < 0) {
10372
0
            maxcount = (len1 - 1) / 2 + 1;
10373
0
        }
10374
0
        switch (kind1) {
10375
0
        case PyUnicode_1BYTE_KIND:
10376
0
            if (PyUnicode_IS_ASCII(self))
10377
0
                return asciilib_rsplit_whitespace(
10378
0
                    self,  PyUnicode_1BYTE_DATA(self),
10379
0
                    len1, maxcount
10380
0
                    );
10381
0
            else
10382
0
                return ucs1lib_rsplit_whitespace(
10383
0
                    self,  PyUnicode_1BYTE_DATA(self),
10384
0
                    len1, maxcount
10385
0
                    );
10386
0
        case PyUnicode_2BYTE_KIND:
10387
0
            return ucs2lib_rsplit_whitespace(
10388
0
                self,  PyUnicode_2BYTE_DATA(self),
10389
0
                len1, maxcount
10390
0
                );
10391
0
        case PyUnicode_4BYTE_KIND:
10392
0
            return ucs4lib_rsplit_whitespace(
10393
0
                self,  PyUnicode_4BYTE_DATA(self),
10394
0
                len1, maxcount
10395
0
                );
10396
0
        default:
10397
0
            Py_UNREACHABLE();
10398
0
        }
10399
0
    }
10400
66
    kind2 = PyUnicode_KIND(substring);
10401
66
    len2 = PyUnicode_GET_LENGTH(substring);
10402
66
    if (maxcount < 0) {
10403
        // if len2 == 0, it will raise ValueError.
10404
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10405
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10406
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10407
0
    }
10408
66
    if (kind1 < kind2 || len1 < len2) {
10409
0
        out = PyList_New(1);
10410
0
        if (out == NULL)
10411
0
            return NULL;
10412
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10413
0
        return out;
10414
0
    }
10415
66
    buf1 = PyUnicode_DATA(self);
10416
66
    buf2 = PyUnicode_DATA(substring);
10417
66
    if (kind2 != kind1) {
10418
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10419
0
        if (!buf2)
10420
0
            return NULL;
10421
0
    }
10422
10423
66
    switch (kind1) {
10424
66
    case PyUnicode_1BYTE_KIND:
10425
66
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10426
66
            out = asciilib_rsplit(
10427
66
                self,  buf1, len1, buf2, len2, maxcount);
10428
0
        else
10429
0
            out = ucs1lib_rsplit(
10430
0
                self,  buf1, len1, buf2, len2, maxcount);
10431
66
        break;
10432
0
    case PyUnicode_2BYTE_KIND:
10433
0
        out = ucs2lib_rsplit(
10434
0
            self,  buf1, len1, buf2, len2, maxcount);
10435
0
        break;
10436
0
    case PyUnicode_4BYTE_KIND:
10437
0
        out = ucs4lib_rsplit(
10438
0
            self,  buf1, len1, buf2, len2, maxcount);
10439
0
        break;
10440
0
    default:
10441
0
        out = NULL;
10442
66
    }
10443
66
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10444
66
    if (kind2 != kind1)
10445
0
        PyMem_Free((void *)buf2);
10446
66
    return out;
10447
66
}
10448
10449
static Py_ssize_t
10450
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10451
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10452
23.0M
{
10453
23.0M
    switch (kind) {
10454
8.45M
    case PyUnicode_1BYTE_KIND:
10455
8.45M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10456
4.23M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10457
4.21M
        else
10458
4.21M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10459
6.88M
    case PyUnicode_2BYTE_KIND:
10460
6.88M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10461
7.75M
    case PyUnicode_4BYTE_KIND:
10462
7.75M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10463
23.0M
    }
10464
23.0M
    Py_UNREACHABLE();
10465
23.0M
}
10466
10467
static Py_ssize_t
10468
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10469
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10470
11.0M
{
10471
11.0M
    switch (kind) {
10472
10.2M
    case PyUnicode_1BYTE_KIND:
10473
10.2M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10474
726k
    case PyUnicode_2BYTE_KIND:
10475
726k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10476
82.4k
    case PyUnicode_4BYTE_KIND:
10477
82.4k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10478
11.0M
    }
10479
11.0M
    Py_UNREACHABLE();
10480
11.0M
}
10481
10482
static void
10483
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10484
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10485
62.3k
{
10486
62.3k
    int kind = PyUnicode_KIND(u);
10487
62.3k
    void *data = PyUnicode_DATA(u);
10488
62.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10489
62.3k
    if (kind == PyUnicode_1BYTE_KIND) {
10490
32.2k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10491
32.2k
                                      (Py_UCS1 *)data + len,
10492
32.2k
                                      u1, u2, maxcount);
10493
32.2k
    }
10494
30.0k
    else if (kind == PyUnicode_2BYTE_KIND) {
10495
24.7k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10496
24.7k
                                      (Py_UCS2 *)data + len,
10497
24.7k
                                      u1, u2, maxcount);
10498
24.7k
    }
10499
5.37k
    else {
10500
5.37k
        assert(kind == PyUnicode_4BYTE_KIND);
10501
5.37k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10502
5.37k
                                      (Py_UCS4 *)data + len,
10503
5.37k
                                      u1, u2, maxcount);
10504
5.37k
    }
10505
62.3k
}
10506
10507
static PyObject *
10508
replace(PyObject *self, PyObject *str1,
10509
        PyObject *str2, Py_ssize_t maxcount)
10510
18.5M
{
10511
18.5M
    PyObject *u;
10512
18.5M
    const char *sbuf = PyUnicode_DATA(self);
10513
18.5M
    const void *buf1 = PyUnicode_DATA(str1);
10514
18.5M
    const void *buf2 = PyUnicode_DATA(str2);
10515
18.5M
    int srelease = 0, release1 = 0, release2 = 0;
10516
18.5M
    int skind = PyUnicode_KIND(self);
10517
18.5M
    int kind1 = PyUnicode_KIND(str1);
10518
18.5M
    int kind2 = PyUnicode_KIND(str2);
10519
18.5M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10520
18.5M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10521
18.5M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10522
18.5M
    int mayshrink;
10523
18.5M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10524
10525
18.5M
    if (slen < len1)
10526
7.04M
        goto nothing;
10527
10528
11.5M
    if (maxcount < 0)
10529
11.5M
        maxcount = PY_SSIZE_T_MAX;
10530
0
    else if (maxcount == 0)
10531
0
        goto nothing;
10532
10533
11.5M
    if (str1 == str2)
10534
28.2k
        goto nothing;
10535
10536
11.4M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10537
11.4M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10538
11.4M
    if (maxchar < maxchar_str1)
10539
        /* substring too wide to be present */
10540
0
        goto nothing;
10541
11.4M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10542
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10543
       result string. */
10544
11.4M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10545
11.4M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10546
10547
11.4M
    if (len1 == len2) {
10548
        /* same length */
10549
397k
        if (len1 == 0)
10550
0
            goto nothing;
10551
397k
        if (len1 == 1) {
10552
            /* replace characters */
10553
390k
            Py_UCS4 u1, u2;
10554
390k
            Py_ssize_t pos;
10555
10556
390k
            u1 = PyUnicode_READ(kind1, buf1, 0);
10557
390k
            pos = findchar(sbuf, skind, slen, u1, 1);
10558
390k
            if (pos < 0)
10559
328k
                goto nothing;
10560
62.3k
            u2 = PyUnicode_READ(kind2, buf2, 0);
10561
62.3k
            u = PyUnicode_New(slen, maxchar);
10562
62.3k
            if (!u)
10563
0
                goto error;
10564
10565
62.3k
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10566
62.3k
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10567
62.3k
        }
10568
7.25k
        else {
10569
7.25k
            int rkind = skind;
10570
7.25k
            char *res;
10571
7.25k
            Py_ssize_t i;
10572
10573
7.25k
            if (kind1 < rkind) {
10574
                /* widen substring */
10575
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10576
0
                if (!buf1) goto error;
10577
0
                release1 = 1;
10578
0
            }
10579
7.25k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10580
7.25k
            if (i < 0)
10581
7.25k
                goto nothing;
10582
0
            if (rkind > kind2) {
10583
                /* widen replacement */
10584
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10585
0
                if (!buf2) goto error;
10586
0
                release2 = 1;
10587
0
            }
10588
0
            else if (rkind < kind2) {
10589
                /* widen self and buf1 */
10590
0
                rkind = kind2;
10591
0
                if (release1) {
10592
0
                    assert(buf1 != PyUnicode_DATA(str1));
10593
0
                    PyMem_Free((void *)buf1);
10594
0
                    buf1 = PyUnicode_DATA(str1);
10595
0
                    release1 = 0;
10596
0
                }
10597
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10598
0
                if (!sbuf) goto error;
10599
0
                srelease = 1;
10600
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10601
0
                if (!buf1) goto error;
10602
0
                release1 = 1;
10603
0
            }
10604
0
            u = PyUnicode_New(slen, maxchar);
10605
0
            if (!u)
10606
0
                goto error;
10607
0
            assert(PyUnicode_KIND(u) == rkind);
10608
0
            res = PyUnicode_DATA(u);
10609
10610
0
            memcpy(res, sbuf, rkind * slen);
10611
            /* change everything in-place, starting with this one */
10612
0
            memcpy(res + rkind * i,
10613
0
                   buf2,
10614
0
                   rkind * len2);
10615
0
            i += len1;
10616
10617
0
            while ( --maxcount > 0) {
10618
0
                i = anylib_find(rkind, self,
10619
0
                                sbuf+rkind*i, slen-i,
10620
0
                                str1, buf1, len1, i);
10621
0
                if (i == -1)
10622
0
                    break;
10623
0
                memcpy(res + rkind * i,
10624
0
                       buf2,
10625
0
                       rkind * len2);
10626
0
                i += len1;
10627
0
            }
10628
0
        }
10629
397k
    }
10630
11.0M
    else {
10631
11.0M
        Py_ssize_t n, i, j, ires;
10632
11.0M
        Py_ssize_t new_size;
10633
11.0M
        int rkind = skind;
10634
11.0M
        char *res;
10635
10636
11.0M
        if (kind1 < rkind) {
10637
            /* widen substring */
10638
808k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10639
808k
            if (!buf1) goto error;
10640
808k
            release1 = 1;
10641
808k
        }
10642
11.0M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10643
11.0M
        if (n == 0)
10644
9.73M
            goto nothing;
10645
1.34M
        if (kind2 < rkind) {
10646
            /* widen replacement */
10647
46.3k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10648
46.3k
            if (!buf2) goto error;
10649
46.3k
            release2 = 1;
10650
46.3k
        }
10651
1.29M
        else if (kind2 > rkind) {
10652
            /* widen self and buf1 */
10653
0
            rkind = kind2;
10654
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10655
0
            if (!sbuf) goto error;
10656
0
            srelease = 1;
10657
0
            if (release1) {
10658
0
                assert(buf1 != PyUnicode_DATA(str1));
10659
0
                PyMem_Free((void *)buf1);
10660
0
                buf1 = PyUnicode_DATA(str1);
10661
0
                release1 = 0;
10662
0
            }
10663
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10664
0
            if (!buf1) goto error;
10665
0
            release1 = 1;
10666
0
        }
10667
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10668
           PyUnicode_GET_LENGTH(str1)); */
10669
1.34M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10670
0
                PyErr_SetString(PyExc_OverflowError,
10671
0
                                "replace string is too long");
10672
0
                goto error;
10673
0
        }
10674
1.34M
        new_size = slen + n * (len2 - len1);
10675
1.34M
        if (new_size == 0) {
10676
0
            u = _PyUnicode_GetEmpty();
10677
0
            goto done;
10678
0
        }
10679
1.34M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10680
0
            PyErr_SetString(PyExc_OverflowError,
10681
0
                            "replace string is too long");
10682
0
            goto error;
10683
0
        }
10684
1.34M
        u = PyUnicode_New(new_size, maxchar);
10685
1.34M
        if (!u)
10686
0
            goto error;
10687
1.34M
        assert(PyUnicode_KIND(u) == rkind);
10688
1.34M
        res = PyUnicode_DATA(u);
10689
1.34M
        ires = i = 0;
10690
1.34M
        if (len1 > 0) {
10691
24.4M
            while (n-- > 0) {
10692
                /* look for next match */
10693
23.0M
                j = anylib_find(rkind, self,
10694
23.0M
                                sbuf + rkind * i, slen-i,
10695
23.0M
                                str1, buf1, len1, i);
10696
23.0M
                if (j == -1)
10697
0
                    break;
10698
23.0M
                else if (j > i) {
10699
                    /* copy unchanged part [i:j] */
10700
4.62M
                    memcpy(res + rkind * ires,
10701
4.62M
                           sbuf + rkind * i,
10702
4.62M
                           rkind * (j-i));
10703
4.62M
                    ires += j - i;
10704
4.62M
                }
10705
                /* copy substitution string */
10706
23.0M
                if (len2 > 0) {
10707
23.0M
                    memcpy(res + rkind * ires,
10708
23.0M
                           buf2,
10709
23.0M
                           rkind * len2);
10710
23.0M
                    ires += len2;
10711
23.0M
                }
10712
23.0M
                i = j + len1;
10713
23.0M
            }
10714
1.34M
            if (i < slen)
10715
                /* copy tail [i:] */
10716
1.33M
                memcpy(res + rkind * ires,
10717
1.33M
                       sbuf + rkind * i,
10718
1.33M
                       rkind * (slen-i));
10719
1.34M
        }
10720
0
        else {
10721
            /* interleave */
10722
0
            while (n > 0) {
10723
0
                memcpy(res + rkind * ires,
10724
0
                       buf2,
10725
0
                       rkind * len2);
10726
0
                ires += len2;
10727
0
                if (--n <= 0)
10728
0
                    break;
10729
0
                memcpy(res + rkind * ires,
10730
0
                       sbuf + rkind * i,
10731
0
                       rkind);
10732
0
                ires++;
10733
0
                i++;
10734
0
            }
10735
0
            memcpy(res + rkind * ires,
10736
0
                   sbuf + rkind * i,
10737
0
                   rkind * (slen-i));
10738
0
        }
10739
1.34M
    }
10740
10741
1.40M
    if (mayshrink) {
10742
0
        unicode_adjust_maxchar(&u);
10743
0
        if (u == NULL)
10744
0
            goto error;
10745
0
    }
10746
10747
1.40M
  done:
10748
1.40M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10749
1.40M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10750
1.40M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10751
1.40M
    if (srelease)
10752
0
        PyMem_Free((void *)sbuf);
10753
1.40M
    if (release1)
10754
46.3k
        PyMem_Free((void *)buf1);
10755
1.40M
    if (release2)
10756
46.3k
        PyMem_Free((void *)buf2);
10757
1.40M
    assert(_PyUnicode_CheckConsistency(u, 1));
10758
1.40M
    return u;
10759
10760
17.1M
  nothing:
10761
    /* nothing to replace; return original string (when possible) */
10762
17.1M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10763
17.1M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10764
17.1M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10765
17.1M
    if (srelease)
10766
0
        PyMem_Free((void *)sbuf);
10767
17.1M
    if (release1)
10768
762k
        PyMem_Free((void *)buf1);
10769
17.1M
    if (release2)
10770
0
        PyMem_Free((void *)buf2);
10771
17.1M
    return unicode_result_unchanged(self);
10772
10773
0
  error:
10774
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10775
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10776
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10777
0
    if (srelease)
10778
0
        PyMem_Free((void *)sbuf);
10779
0
    if (release1)
10780
0
        PyMem_Free((void *)buf1);
10781
0
    if (release2)
10782
0
        PyMem_Free((void *)buf2);
10783
0
    return NULL;
10784
1.40M
}
10785
10786
/* --- Unicode Object Methods --------------------------------------------- */
10787
10788
/*[clinic input]
10789
@permit_long_docstring_body
10790
str.title as unicode_title
10791
10792
Return a version of the string where each word is titlecased.
10793
10794
More specifically, words start with uppercased characters and all remaining
10795
cased characters have lower case.
10796
[clinic start generated code]*/
10797
10798
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* str.title(): delegate entirely to case_operation() with the
       do_title callback and hand back its result. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10804
10805
/*[clinic input]
10806
@permit_long_docstring_body
10807
str.capitalize as unicode_capitalize
10808
10809
Return a capitalized version of the string.
10810
10811
More specifically, make the first character have upper case and the rest lower
10812
case.
10813
[clinic start generated code]*/
10814
10815
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* str.capitalize(): a non-empty string goes through case_operation()
       with do_capitalize; an empty string is returned via
       unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10823
10824
/*[clinic input]
10825
str.casefold as unicode_casefold
10826
10827
Return a version of the string suitable for caseless comparisons.
10828
[clinic start generated code]*/
10829
10830
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): non-ASCII strings use case_operation() with
       do_casefold; ASCII strings take the ascii_upper_or_lower() path.
       NOTE(review): the flag value 1 presumably selects lowercasing --
       confirm against ascii_upper_or_lower(). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    return ascii_upper_or_lower(self, 1);
}
10838
10839
10840
/* Argument converter. Accepts a single Unicode character. */
10841
10842
static int
10843
convert_uc(PyObject *obj, void *addr)
10844
130
{
10845
130
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10846
10847
130
    if (!PyUnicode_Check(obj)) {
10848
0
        PyErr_Format(PyExc_TypeError,
10849
0
                     "The fill character must be a unicode character, "
10850
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10851
0
        return 0;
10852
0
    }
10853
130
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10854
0
        PyErr_SetString(PyExc_TypeError,
10855
0
                        "The fill character must be exactly one character long");
10856
0
        return 0;
10857
0
    }
10858
130
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10859
130
    return 1;
10860
130
}
10861
10862
/*[clinic input]
10863
str.center as unicode_center
10864
10865
    width: Py_ssize_t
10866
    fillchar: Py_UCS4 = ' '
10867
    /
10868
10869
Return a centered string of length width.
10870
10871
Padding is done using the specified fill character (default is a space).
10872
[clinic start generated code]*/
10873
10874
static PyObject *
10875
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10876
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10877
0
{
10878
0
    Py_ssize_t marg, left;
10879
10880
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10881
0
        return unicode_result_unchanged(self);
10882
10883
0
    marg = width - PyUnicode_GET_LENGTH(self);
10884
0
    left = marg / 2 + (marg & width & 1);
10885
10886
0
    return pad(self, left, marg - left, fillchar);
10887
0
}
10888
10889
/* This function assumes that str1 and str2 are readied by the caller. */
10890
10891
static int
10892
unicode_compare(PyObject *str1, PyObject *str2)
10893
30.4M
{
10894
30.4M
#define COMPARE(TYPE1, TYPE2) \
10895
30.4M
    do { \
10896
27.7M
        TYPE1* p1 = (TYPE1 *)data1; \
10897
27.7M
        TYPE2* p2 = (TYPE2 *)data2; \
10898
27.7M
        TYPE1* end = p1 + len; \
10899
27.7M
        Py_UCS4 c1, c2; \
10900
27.7M
        for (; p1 != end; p1++, p2++) { \
10901
27.7M
            c1 = *p1; \
10902
27.7M
            c2 = *p2; \
10903
27.7M
            if (c1 != c2) \
10904
27.7M
                return (c1 < c2) ? -1 : 1; \
10905
27.7M
        } \
10906
27.7M
    } \
10907
27.7M
    while (0)
10908
10909
30.4M
    int kind1, kind2;
10910
30.4M
    const void *data1, *data2;
10911
30.4M
    Py_ssize_t len1, len2, len;
10912
10913
30.4M
    kind1 = PyUnicode_KIND(str1);
10914
30.4M
    kind2 = PyUnicode_KIND(str2);
10915
30.4M
    data1 = PyUnicode_DATA(str1);
10916
30.4M
    data2 = PyUnicode_DATA(str2);
10917
30.4M
    len1 = PyUnicode_GET_LENGTH(str1);
10918
30.4M
    len2 = PyUnicode_GET_LENGTH(str2);
10919
30.4M
    len = Py_MIN(len1, len2);
10920
10921
30.4M
    switch(kind1) {
10922
4.31M
    case PyUnicode_1BYTE_KIND:
10923
4.31M
    {
10924
4.31M
        switch(kind2) {
10925
436k
        case PyUnicode_1BYTE_KIND:
10926
436k
        {
10927
436k
            int cmp = memcmp(data1, data2, len);
10928
            /* normalize result of memcmp() into the range [-1; 1] */
10929
436k
            if (cmp < 0)
10930
370k
                return -1;
10931
65.9k
            if (cmp > 0)
10932
60.0k
                return 1;
10933
5.90k
            break;
10934
65.9k
        }
10935
3.37M
        case PyUnicode_2BYTE_KIND:
10936
3.37M
            COMPARE(Py_UCS1, Py_UCS2);
10937
0
            break;
10938
507k
        case PyUnicode_4BYTE_KIND:
10939
507k
            COMPARE(Py_UCS1, Py_UCS4);
10940
0
            break;
10941
0
        default:
10942
0
            Py_UNREACHABLE();
10943
4.31M
        }
10944
5.90k
        break;
10945
4.31M
    }
10946
23.3M
    case PyUnicode_2BYTE_KIND:
10947
23.3M
    {
10948
23.3M
        switch(kind2) {
10949
79.4k
        case PyUnicode_1BYTE_KIND:
10950
79.4k
            COMPARE(Py_UCS2, Py_UCS1);
10951
0
            break;
10952
22.6M
        case PyUnicode_2BYTE_KIND:
10953
22.6M
        {
10954
22.6M
            COMPARE(Py_UCS2, Py_UCS2);
10955
0
            break;
10956
22.6M
        }
10957
674k
        case PyUnicode_4BYTE_KIND:
10958
674k
            COMPARE(Py_UCS2, Py_UCS4);
10959
0
            break;
10960
0
        default:
10961
0
            Py_UNREACHABLE();
10962
23.3M
        }
10963
0
        break;
10964
23.3M
    }
10965
2.80M
    case PyUnicode_4BYTE_KIND:
10966
2.80M
    {
10967
2.80M
        switch(kind2) {
10968
9.17k
        case PyUnicode_1BYTE_KIND:
10969
9.17k
            COMPARE(Py_UCS4, Py_UCS1);
10970
0
            break;
10971
515k
        case PyUnicode_2BYTE_KIND:
10972
515k
            COMPARE(Py_UCS4, Py_UCS2);
10973
0
            break;
10974
2.28M
        case PyUnicode_4BYTE_KIND:
10975
2.28M
        {
10976
2.28M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10977
2.28M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10978
            /* normalize result of wmemcmp() into the range [-1; 1] */
10979
2.28M
            if (cmp < 0)
10980
1.10M
                return -1;
10981
1.18M
            if (cmp > 0)
10982
1.18M
                return 1;
10983
#else
10984
            COMPARE(Py_UCS4, Py_UCS4);
10985
#endif
10986
0
            break;
10987
1.18M
        }
10988
0
        default:
10989
0
            Py_UNREACHABLE();
10990
2.80M
        }
10991
0
        break;
10992
2.80M
    }
10993
0
    default:
10994
0
        Py_UNREACHABLE();
10995
30.4M
    }
10996
10997
5.90k
    if (len1 == len2)
10998
5.86k
        return 0;
10999
36
    if (len1 < len2)
11000
13
        return -1;
11001
23
    else
11002
23
        return 1;
11003
11004
36
#undef COMPARE
11005
36
}
11006
11007
11008
int
11009
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11010
623M
{
11011
623M
    assert(PyUnicode_Check(str1));
11012
623M
    assert(PyUnicode_Check(str2));
11013
623M
    if (str1 == str2) {
11014
86.1M
        return 1;
11015
86.1M
    }
11016
537M
    return unicode_eq(str1, str2);
11017
623M
}
11018
11019
11020
int
11021
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11022
0
{
11023
0
    if (!PyUnicode_Check(str1)) {
11024
0
        PyErr_Format(PyExc_TypeError,
11025
0
                     "first argument must be str, not %T", str1);
11026
0
        return -1;
11027
0
    }
11028
0
    if (!PyUnicode_Check(str2)) {
11029
0
        PyErr_Format(PyExc_TypeError,
11030
0
                     "second argument must be str, not %T", str2);
11031
0
        return -1;
11032
0
    }
11033
11034
0
    return _PyUnicode_Equal(str1, str2);
11035
0
}
11036
11037
11038
int
11039
PyUnicode_Compare(PyObject *left, PyObject *right)
11040
261k
{
11041
261k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11042
        /* a string is equal to itself */
11043
261k
        if (left == right)
11044
0
            return 0;
11045
11046
261k
        return unicode_compare(left, right);
11047
261k
    }
11048
0
    PyErr_Format(PyExc_TypeError,
11049
0
                 "Can't compare %.100s and %.100s",
11050
0
                 Py_TYPE(left)->tp_name,
11051
0
                 Py_TYPE(right)->tp_name);
11052
0
    return -1;
11053
261k
}
11054
11055
int
11056
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11057
12.2M
{
11058
12.2M
    Py_ssize_t i;
11059
12.2M
    int kind;
11060
12.2M
    Py_UCS4 chr;
11061
11062
12.2M
    assert(_PyUnicode_CHECK(uni));
11063
12.2M
    kind = PyUnicode_KIND(uni);
11064
12.2M
    if (kind == PyUnicode_1BYTE_KIND) {
11065
12.2M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11066
12.2M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11067
12.2M
        size_t len, len2 = strlen(str);
11068
12.2M
        int cmp;
11069
11070
12.2M
        len = Py_MIN(len1, len2);
11071
12.2M
        cmp = memcmp(data, str, len);
11072
12.2M
        if (cmp != 0) {
11073
8.11M
            if (cmp < 0)
11074
48.5k
                return -1;
11075
8.06M
            else
11076
8.06M
                return 1;
11077
8.11M
        }
11078
4.18M
        if (len1 > len2)
11079
200
            return 1; /* uni is longer */
11080
4.18M
        if (len1 < len2)
11081
687
            return -1; /* str is longer */
11082
4.18M
        return 0;
11083
4.18M
    }
11084
1.54k
    else {
11085
1.54k
        const void *data = PyUnicode_DATA(uni);
11086
        /* Compare Unicode string and source character set string */
11087
2.66k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11088
2.45k
            if (chr != (unsigned char)str[i])
11089
1.34k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11090
        /* This check keeps Python strings that end in '\0' from comparing equal
11091
         to C strings identical up to that point. */
11092
203
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11093
203
            return 1; /* uni is longer */
11094
0
        if (str[i])
11095
0
            return -1; /* str is longer */
11096
0
        return 0;
11097
0
    }
11098
12.2M
}
11099
11100
int
11101
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11102
24
{
11103
24
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11104
24
}
11105
11106
int
11107
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11108
24
{
11109
24
    assert(_PyUnicode_CHECK(unicode));
11110
24
    assert(str);
11111
11112
24
    if (PyUnicode_IS_ASCII(unicode)) {
11113
24
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11114
24
        return size == len &&
11115
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116
24
    }
11117
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11118
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11119
0
        return size == len &&
11120
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11121
0
    }
11122
11123
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11124
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11125
0
        return 0;
11126
0
    }
11127
0
    const unsigned char *s = (const unsigned char *)str;
11128
0
    const unsigned char *ends = s + (size_t)size;
11129
0
    int kind = PyUnicode_KIND(unicode);
11130
0
    const void *data = PyUnicode_DATA(unicode);
11131
    /* Compare Unicode string and UTF-8 string */
11132
0
    for (Py_ssize_t i = 0; i < len; i++) {
11133
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11134
0
        if (ch < 0x80) {
11135
0
            if (ends == s || s[0] != ch) {
11136
0
                return 0;
11137
0
            }
11138
0
            s += 1;
11139
0
        }
11140
0
        else if (ch < 0x800) {
11141
0
            if ((ends - s) < 2 ||
11142
0
                s[0] != (0xc0 | (ch >> 6)) ||
11143
0
                s[1] != (0x80 | (ch & 0x3f)))
11144
0
            {
11145
0
                return 0;
11146
0
            }
11147
0
            s += 2;
11148
0
        }
11149
0
        else if (ch < 0x10000) {
11150
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11151
0
                (ends - s) < 3 ||
11152
0
                s[0] != (0xe0 | (ch >> 12)) ||
11153
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11154
0
                s[2] != (0x80 | (ch & 0x3f)))
11155
0
            {
11156
0
                return 0;
11157
0
            }
11158
0
            s += 3;
11159
0
        }
11160
0
        else {
11161
0
            assert(ch <= MAX_UNICODE);
11162
0
            if ((ends - s) < 4 ||
11163
0
                s[0] != (0xf0 | (ch >> 18)) ||
11164
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11165
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11166
0
                s[3] != (0x80 | (ch & 0x3f)))
11167
0
            {
11168
0
                return 0;
11169
0
            }
11170
0
            s += 4;
11171
0
        }
11172
0
    }
11173
0
    return s == ends;
11174
0
}
11175
11176
int
11177
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11178
37.8M
{
11179
37.8M
    size_t len;
11180
37.8M
    assert(_PyUnicode_CHECK(unicode));
11181
37.8M
    assert(str);
11182
#ifndef NDEBUG
11183
    for (const char *p = str; *p; p++) {
11184
        assert((unsigned char)*p < 128);
11185
    }
11186
#endif
11187
37.8M
    if (!PyUnicode_IS_ASCII(unicode))
11188
176k
        return 0;
11189
37.6M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11190
37.6M
    return strlen(str) == len &&
11191
700k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11192
37.8M
}
11193
11194
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    /* Rich comparison between two str objects.

       Yields NotImplemented when either operand is not a str.  When the
       two operands are the very same object the answer is derived from
       `op` alone; an unrecognised `op` in that case reports a bad
       argument and returns NULL.  Equality tests use unicode_eq(),
       ordering tests use unicode_compare(). */
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        switch (op) {
        case Py_EQ:
        case Py_LE:
        case Py_GE:
            Py_RETURN_TRUE;
        case Py_NE:
        case Py_LT:
        case Py_GT:
            Py_RETURN_FALSE;
        default:
            PyErr_BadArgument();
            return NULL;
        }
    }

    if (op == Py_EQ || op == Py_NE) {
        /* XOR flips the equality result when "not equal" was requested. */
        int verdict = unicode_eq(left, right);
        verdict ^= (op == Py_NE);
        return PyBool_FromLong(verdict);
    }

    /* Ordering: compare the three-way result against zero under `op`. */
    int order = unicode_compare(left, right);
    Py_RETURN_RICHCOMPARE(order, 0, op);
}
11228
11229
int
11230
PyUnicode_Contains(PyObject *str, PyObject *substr)
11231
222M
{
11232
222M
    int kind1, kind2;
11233
222M
    const void *buf1, *buf2;
11234
222M
    Py_ssize_t len1, len2;
11235
222M
    int result;
11236
11237
222M
    if (!PyUnicode_Check(substr)) {
11238
0
        PyErr_Format(PyExc_TypeError,
11239
0
                     "'in <string>' requires string as left operand, not %.100s",
11240
0
                     Py_TYPE(substr)->tp_name);
11241
0
        return -1;
11242
0
    }
11243
222M
    if (ensure_unicode(str) < 0)
11244
0
        return -1;
11245
11246
222M
    kind1 = PyUnicode_KIND(str);
11247
222M
    kind2 = PyUnicode_KIND(substr);
11248
222M
    if (kind1 < kind2)
11249
13.7M
        return 0;
11250
208M
    len1 = PyUnicode_GET_LENGTH(str);
11251
208M
    len2 = PyUnicode_GET_LENGTH(substr);
11252
208M
    if (len1 < len2)
11253
1.11M
        return 0;
11254
207M
    buf1 = PyUnicode_DATA(str);
11255
207M
    buf2 = PyUnicode_DATA(substr);
11256
207M
    if (len2 == 1) {
11257
186M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11258
186M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11259
186M
        return result;
11260
186M
    }
11261
20.8M
    if (kind2 != kind1) {
11262
19.5k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11263
19.5k
        if (!buf2)
11264
0
            return -1;
11265
19.5k
    }
11266
11267
20.8M
    switch (kind1) {
11268
20.8M
    case PyUnicode_1BYTE_KIND:
11269
20.8M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11270
20.8M
        break;
11271
14.4k
    case PyUnicode_2BYTE_KIND:
11272
14.4k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11273
14.4k
        break;
11274
5.15k
    case PyUnicode_4BYTE_KIND:
11275
5.15k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11276
5.15k
        break;
11277
0
    default:
11278
0
        Py_UNREACHABLE();
11279
20.8M
    }
11280
11281
20.8M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11282
20.8M
    if (kind2 != kind1)
11283
19.5k
        PyMem_Free((void *)buf2);
11284
11285
20.8M
    return result;
11286
20.8M
}
11287
11288
/* Concat to string or Unicode object giving a new Unicode object. */
11289
11290
PyObject *
PyUnicode_Concat(PyObject *left, PyObject *right)
{
    /* Build the concatenation of `left` and `right`.

       Returns NULL on failure: a TypeError is raised here when `right`
       is not a str, and an OverflowError when the combined length would
       exceed PY_SSIZE_T_MAX. */
    if (ensure_unicode(left) < 0) {
        return NULL;
    }
    if (!PyUnicode_Check(right)) {
        PyErr_Format(PyExc_TypeError,
            "can only concatenate str (not \"%.200s\") to str",
            Py_TYPE(right)->tp_name);
        return NULL;
    }

    /* Shortcuts: when one side is the empty-string singleton the result
       is simply the other side passed through PyUnicode_FromObject(). */
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
    if (left == empty) {
        return PyUnicode_FromObject(right);
    }
    if (right == empty) {
        return PyUnicode_FromObject(left);
    }

    Py_ssize_t head_len = PyUnicode_GET_LENGTH(left);
    Py_ssize_t tail_len = PyUnicode_GET_LENGTH(right);
    /* Written as a subtraction so the check itself cannot overflow. */
    if (head_len > PY_SSIZE_T_MAX - tail_len) {
        PyErr_SetString(PyExc_OverflowError,
                        "strings are too large to concat");
        return NULL;
    }
    Py_ssize_t total_len = head_len + tail_len;

    /* The result is allocated for the larger of both max-char values. */
    Py_UCS4 head_max = PyUnicode_MAX_CHAR_VALUE(left);
    Py_UCS4 tail_max = PyUnicode_MAX_CHAR_VALUE(right);
    Py_UCS4 widest = Py_MAX(head_max, tail_max);

    PyObject *joined = PyUnicode_New(total_len, widest);
    if (joined == NULL) {
        return NULL;
    }
    _PyUnicode_FastCopyCharacters(joined, 0, left, 0, head_len);
    _PyUnicode_FastCopyCharacters(joined, head_len, right, 0, tail_len);
    assert(_PyUnicode_CheckConsistency(joined, 1));
    return joined;
}
11338
11339
void
11340
PyUnicode_Append(PyObject **p_left, PyObject *right)
11341
5.61M
{
11342
5.61M
    PyObject *left, *res;
11343
5.61M
    Py_UCS4 maxchar, maxchar2;
11344
5.61M
    Py_ssize_t left_len, right_len, new_len;
11345
11346
5.61M
    if (p_left == NULL) {
11347
0
        if (!PyErr_Occurred())
11348
0
            PyErr_BadInternalCall();
11349
0
        return;
11350
0
    }
11351
5.61M
    left = *p_left;
11352
5.61M
    if (right == NULL || left == NULL
11353
5.61M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11354
0
        if (!PyErr_Occurred())
11355
0
            PyErr_BadInternalCall();
11356
0
        goto error;
11357
0
    }
11358
11359
    /* Shortcuts */
11360
5.61M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11361
5.61M
    if (left == empty) {
11362
493k
        Py_DECREF(left);
11363
493k
        *p_left = Py_NewRef(right);
11364
493k
        return;
11365
493k
    }
11366
5.12M
    if (right == empty) {
11367
13.3k
        return;
11368
13.3k
    }
11369
11370
5.10M
    left_len = PyUnicode_GET_LENGTH(left);
11371
5.10M
    right_len = PyUnicode_GET_LENGTH(right);
11372
5.10M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11373
0
        PyErr_SetString(PyExc_OverflowError,
11374
0
                        "strings are too large to concat");
11375
0
        goto error;
11376
0
    }
11377
5.10M
    new_len = left_len + right_len;
11378
11379
5.10M
    if (_PyUnicode_IsModifiable(left)
11380
5.10M
        && PyUnicode_CheckExact(right)
11381
5.10M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11382
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11383
           to change the structure size, but characters are stored just after
11384
           the structure, and so it requires to move all characters which is
11385
           not so different than duplicating the string. */
11386
2.35M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11387
2.35M
    {
11388
        /* append inplace */
11389
2.35M
        if (unicode_resize(p_left, new_len) != 0)
11390
0
            goto error;
11391
11392
        /* copy 'right' into the newly allocated area of 'left' */
11393
2.35M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11394
2.35M
    }
11395
2.75M
    else {
11396
2.75M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11397
2.75M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11398
2.75M
        maxchar = Py_MAX(maxchar, maxchar2);
11399
11400
        /* Concat the two Unicode strings */
11401
2.75M
        res = PyUnicode_New(new_len, maxchar);
11402
2.75M
        if (res == NULL)
11403
0
            goto error;
11404
2.75M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11405
2.75M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11406
2.75M
        Py_DECREF(left);
11407
2.75M
        *p_left = res;
11408
2.75M
    }
11409
5.10M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11410
5.10M
    return;
11411
11412
0
error:
11413
0
    Py_CLEAR(*p_left);
11414
0
}
11415
11416
void
11417
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11418
8
{
11419
8
    PyUnicode_Append(pleft, right);
11420
8
    Py_XDECREF(right);
11421
8
}
11422
11423
/*[clinic input]
11424
@permit_long_summary
11425
@text_signature "($self, sub[, start[, end]], /)"
11426
str.count as unicode_count -> Py_ssize_t
11427
11428
    self as str: self
11429
    sub as substr: unicode
11430
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11431
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11432
    /
11433
11434
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11435
11436
Optional arguments start and end are interpreted as in slice notation.
11437
[clinic start generated code]*/
11438
11439
static Py_ssize_t
11440
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11441
                   Py_ssize_t end)
11442
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11443
26.7M
{
11444
26.7M
    assert(PyUnicode_Check(str));
11445
26.7M
    assert(PyUnicode_Check(substr));
11446
11447
26.7M
    Py_ssize_t result;
11448
26.7M
    int kind1, kind2;
11449
26.7M
    const void *buf1 = NULL, *buf2 = NULL;
11450
26.7M
    Py_ssize_t len1, len2;
11451
11452
26.7M
    kind1 = PyUnicode_KIND(str);
11453
26.7M
    kind2 = PyUnicode_KIND(substr);
11454
26.7M
    if (kind1 < kind2)
11455
0
        return 0;
11456
11457
26.7M
    len1 = PyUnicode_GET_LENGTH(str);
11458
26.7M
    len2 = PyUnicode_GET_LENGTH(substr);
11459
26.7M
    ADJUST_INDICES(start, end, len1);
11460
26.7M
    if (end - start < len2)
11461
3.02M
        return 0;
11462
11463
23.7M
    buf1 = PyUnicode_DATA(str);
11464
23.7M
    buf2 = PyUnicode_DATA(substr);
11465
23.7M
    if (kind2 != kind1) {
11466
6.76M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11467
6.76M
        if (!buf2)
11468
0
            goto onError;
11469
6.76M
    }
11470
11471
    // We don't reuse `anylib_count` here because of the explicit casts.
11472
23.7M
    switch (kind1) {
11473
16.9M
    case PyUnicode_1BYTE_KIND:
11474
16.9M
        result = ucs1lib_count(
11475
16.9M
            ((const Py_UCS1*)buf1) + start, end - start,
11476
16.9M
            buf2, len2, PY_SSIZE_T_MAX
11477
16.9M
            );
11478
16.9M
        break;
11479
4.01M
    case PyUnicode_2BYTE_KIND:
11480
4.01M
        result = ucs2lib_count(
11481
4.01M
            ((const Py_UCS2*)buf1) + start, end - start,
11482
4.01M
            buf2, len2, PY_SSIZE_T_MAX
11483
4.01M
            );
11484
4.01M
        break;
11485
2.74M
    case PyUnicode_4BYTE_KIND:
11486
2.74M
        result = ucs4lib_count(
11487
2.74M
            ((const Py_UCS4*)buf1) + start, end - start,
11488
2.74M
            buf2, len2, PY_SSIZE_T_MAX
11489
2.74M
            );
11490
2.74M
        break;
11491
0
    default:
11492
0
        Py_UNREACHABLE();
11493
23.7M
    }
11494
11495
23.7M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11496
23.7M
    if (kind2 != kind1)
11497
6.76M
        PyMem_Free((void *)buf2);
11498
11499
23.7M
    return result;
11500
0
  onError:
11501
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11502
0
    if (kind2 != kind1)
11503
0
        PyMem_Free((void *)buf2);
11504
0
    return -1;
11505
23.7M
}
11506
11507
/*[clinic input]
11508
str.encode as unicode_encode
11509
11510
    encoding: str(c_default="NULL") = 'utf-8'
11511
        The encoding in which to encode the string.
11512
    errors: str(c_default="NULL") = 'strict'
11513
        The error handling scheme to use for encoding errors.
11514
        The default is 'strict' meaning that encoding errors raise a
11515
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11516
        'xmlcharrefreplace' as well as any other name registered with
11517
        codecs.register_error that can handle UnicodeEncodeErrors.
11518
11519
Encode the string using the codec registered for encoding.
11520
[clinic start generated code]*/
11521
11522
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): forwards all three arguments unchanged to
       PyUnicode_AsEncodedString() and hands its result back. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11528
11529
/*[clinic input]
11530
str.expandtabs as unicode_expandtabs
11531
11532
    tabsize: int = 8
11533
11534
Return a copy where all tab characters are expanded using spaces.
11535
11536
If tabsize is not given, a tab size of 8 characters is assumed.
11537
[clinic start generated code]*/
11538
11539
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): replace every tab by the number of spaces needed
       to reach the next multiple of `tabsize` on the current line.  With
       tabsize <= 0 tabs contribute nothing to the output.  '\n' and '\r'
       reset the column counter.  A string without any tab is returned
       through unicode_result_unchanged().  Raises OverflowError when the
       expanded length would exceed PY_SSIZE_T_MAX. */
    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    Py_ssize_t out_len = 0;     /* expanded length accumulated so far */
    Py_ssize_t column = 0;      /* position within the current line */
    int saw_tab = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            else {
                column++;
            }
            continue;
        }
        saw_tab = 1;
        if (tabsize > 0) {
            /* cannot overflow */
            const Py_ssize_t pad = tabsize - (column % tabsize);
            if (out_len > PY_SSIZE_T_MAX - pad) {
                goto overflow;
            }
            column += pad;
            out_len += pad;
        }
    }
    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    PyObject *expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    void *dest_data = PyUnicode_DATA(expanded);

    Py_ssize_t written = 0;     /* next write index in dest_data */
    column = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, pos);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, written, ch);
            written++;
            if (ch == '\n' || ch == '\r') {
                column = 0;
            }
            else {
                column++;
            }
            continue;
        }
        if (tabsize > 0) {
            const Py_ssize_t pad = tabsize - (column % tabsize);
            column += pad;
            _PyUnicode_Fill(kind, dest_data, ' ', written, pad);
            written += pad;
        }
    }
    assert (written == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11614
11615
/*[clinic input]
11616
@permit_long_summary
11617
str.find as unicode_find = str.count
11618
11619
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11620
11621
Optional arguments start and end are interpreted as in slice notation.
11622
Return -1 on failure.
11623
[clinic start generated code]*/
11624
11625
static Py_ssize_t
11626
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11627
                  Py_ssize_t end)
11628
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11629
25.2M
{
11630
25.2M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11631
25.2M
    if (result < 0) {
11632
5.39M
        return -1;
11633
5.39M
    }
11634
19.8M
    return result;
11635
25.2M
}
11636
11637
static PyObject *
11638
unicode_getitem(PyObject *self, Py_ssize_t index)
11639
57.4M
{
11640
57.4M
    const void *data;
11641
57.4M
    int kind;
11642
57.4M
    Py_UCS4 ch;
11643
11644
57.4M
    if (!PyUnicode_Check(self)) {
11645
0
        PyErr_BadArgument();
11646
0
        return NULL;
11647
0
    }
11648
57.4M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11649
15.4k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11650
15.4k
        return NULL;
11651
15.4k
    }
11652
57.4M
    kind = PyUnicode_KIND(self);
11653
57.4M
    data = PyUnicode_DATA(self);
11654
57.4M
    ch = PyUnicode_READ(kind, data, index);
11655
57.4M
    return unicode_char(ch);
11656
57.4M
}
11657
11658
/* Believe it or not, this produces the same value for ASCII strings
11659
   as bytes_hash(). */
11660
static Py_hash_t
11661
unicode_hash(PyObject *self)
11662
989M
{
11663
989M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11664
11665
#ifdef Py_DEBUG
11666
    assert(_Py_HashSecret_Initialized);
11667
#endif
11668
989M
    Py_hash_t hash = PyUnicode_HASH(self);
11669
989M
    if (hash != -1) {
11670
940M
        return hash;
11671
940M
    }
11672
48.6M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11673
48.6M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11674
11675
48.6M
    PyUnicode_SET_HASH(self, x);
11676
48.6M
    return x;
11677
989M
}
11678
11679
/*[clinic input]
11680
@permit_long_summary
11681
str.index as unicode_index = str.count
11682
11683
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11684
11685
Optional arguments start and end are interpreted as in slice notation.
11686
Raises ValueError when the substring is not found.
11687
[clinic start generated code]*/
11688
11689
static Py_ssize_t
11690
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11691
                   Py_ssize_t end)
11692
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11693
45.2k
{
11694
45.2k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11695
45.2k
    if (result == -1) {
11696
712
        PyErr_SetString(PyExc_ValueError, "substring not found");
11697
712
    }
11698
44.5k
    else if (result < 0) {
11699
0
        return -1;
11700
0
    }
11701
45.2k
    return result;
11702
45.2k
}
11703
11704
/*[clinic input]
11705
str.isascii as unicode_isascii
11706
11707
Return True if all characters in the string are ASCII, False otherwise.
11708
11709
ASCII characters have code points in the range U+0000-U+007F.
11710
Empty string is ASCII too.
11711
[clinic start generated code]*/
11712
11713
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* str.isascii(): expose the PyUnicode_IS_ASCII() flag of the string
       as a bool; no characters are scanned here. */
    const long ascii_only = PyUnicode_IS_ASCII(self);
    return PyBool_FromLong(ascii_only);
}
11719
11720
/*[clinic input]
11721
@permit_long_docstring_body
11722
str.islower as unicode_islower
11723
11724
Return True if the string is a lowercase string, False otherwise.
11725
11726
A string is lowercase if all cased characters in the string are lowercase and
11727
there is at least one cased character in the string.
11728
[clinic start generated code]*/
11729
11730
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* str.islower(): True when no character is uppercase or titlecase
       and at least one character is lowercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character was seen the test is skipped. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11763
11764
/*[clinic input]
11765
@permit_long_docstring_body
11766
str.isupper as unicode_isupper
11767
11768
Return True if the string is an uppercase string, False otherwise.
11769
11770
A string is uppercase if all cased characters in the string are uppercase and
11771
there is at least one cased character in the string.
11772
[clinic start generated code]*/
11773
11774
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* str.isupper(): True when no character is lowercase or titlecase
       and at least one character is uppercase. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character was seen the test is skipped. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11807
11808
/*[clinic input]
11809
str.istitle as unicode_istitle
11810
11811
Return True if the string is a title-cased string, False otherwise.
11812
11813
In a title-cased string, upper- and title-case characters may only
11814
follow uncased characters and lowercase characters only cased ones.
11815
[clinic start generated code]*/
11816
11817
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* str.istitle(): an upper- or titlecase character must not follow a
       cased character, a lowercase character must follow one, and the
       string must contain at least one cased character. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    int any_cased = 0;
    int after_cased = 0;   /* was the previous character cased? */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A capital in the middle of a cased run is not title case. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else if (Py_UNICODE_ISLOWER(ch)) {
            /* A lowercase character may not start a cased run. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
        }
        else {
            after_cased = 0;
            continue;
        }
        after_cased = 1;
        any_cased = 1;
    }
    return PyBool_FromLong(any_cased);
}
11863
11864
/*[clinic input]
11865
@permit_long_docstring_body
11866
str.isspace as unicode_isspace
11867
11868
Return True if the string is a whitespace string, False otherwise.
11869
11870
A string is whitespace if all characters in the string are whitespace and there
11871
is at least one character in the string.
11872
[clinic start generated code]*/
11873
11874
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* str.isspace(): True for a non-empty string in which every
       character satisfies Py_UNICODE_ISSPACE. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    /* The first non-whitespace character settles the answer. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11902
11903
/*[clinic input]
11904
@permit_long_docstring_body
11905
str.isalpha as unicode_isalpha
11906
11907
Return True if the string is an alphabetic string, False otherwise.
11908
11909
A string is alphabetic if all characters in the string are alphabetic and there
11910
is at least one character in the string.
11911
[clinic start generated code]*/
11912
11913
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* str.isalpha(): True for a non-empty string in which every
       character satisfies Py_UNICODE_ISALPHA. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    /* The first non-alphabetic character settles the answer. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11940
11941
/*[clinic input]
11942
@permit_long_docstring_body
11943
str.isalnum as unicode_isalnum
11944
11945
Return True if the string is an alpha-numeric string, False otherwise.
11946
11947
A string is alpha-numeric if all characters in the string are alpha-numeric and
11948
there is at least one character in the string.
11949
[clinic start generated code]*/
11950
11951
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* str.isalnum(): True for a non-empty string in which every
       character satisfies Py_UNICODE_ISALNUM. */
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* The first non-alphanumeric character settles the answer. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
11980
11981
/*[clinic input]
11982
@permit_long_docstring_body
11983
str.isdecimal as unicode_isdecimal
11984
11985
Return True if the string is a decimal string, False otherwise.
11986
11987
A string is a decimal string if all characters in the string are decimal and
11988
there is at least one character in the string.
11989
[clinic start generated code]*/
11990
11991
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* str.isdecimal(): True for a non-empty string in which every
       character satisfies Py_UNICODE_ISDECIMAL. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
    }

    /* The first non-decimal character settles the answer. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12018
12019
/*[clinic input]
12020
@permit_long_docstring_body
12021
str.isdigit as unicode_isdigit
12022
12023
Return True if the string is a digit string, False otherwise.
12024
12025
A string is a digit string if all characters in the string are digits and there
12026
is at least one character in the string.
12027
[clinic start generated code]*/
12028
12029
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* str.isdigit(): True for a non-empty string in which every
       character satisfies Py_UNICODE_ISDIGIT. */
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* The first non-digit character settles the answer. */
    for (Py_ssize_t pos = 0; pos < nchars; pos++) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12057
12058
/*[clinic input]
12059
@permit_long_docstring_body
12060
str.isnumeric as unicode_isnumeric
12061
12062
Return True if the string is a numeric string, False otherwise.
12063
12064
A string is numeric if all characters in the string are numeric and there is at
12065
least one character in the string.
12066
[clinic start generated code]*/
12067
12068
static PyObject *
12069
unicode_isnumeric_impl(PyObject *self)
12070
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
12071
0
{
12072
0
    Py_ssize_t i, length;
12073
0
    int kind;
12074
0
    const void *data;
12075
12076
0
    length = PyUnicode_GET_LENGTH(self);
12077
0
    kind = PyUnicode_KIND(self);
12078
0
    data = PyUnicode_DATA(self);
12079
12080
    /* Shortcut for single character strings */
12081
0
    if (length == 1)
12082
0
        return PyBool_FromLong(
12083
0
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12084
12085
    /* Special case for empty strings */
12086
0
    if (length == 0)
12087
0
        Py_RETURN_FALSE;
12088
12089
0
    for (i = 0; i < length; i++) {
12090
0
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12091
0
            Py_RETURN_FALSE;
12092
0
    }
12093
0
    Py_RETURN_TRUE;
12094
0
}
12095
12096
Py_ssize_t
12097
_PyUnicode_ScanIdentifier(PyObject *self)
12098
63.3k
{
12099
63.3k
    Py_ssize_t i;
12100
63.3k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12101
63.3k
    if (len == 0) {
12102
        /* an empty string is not a valid identifier */
12103
0
        return 0;
12104
0
    }
12105
12106
63.3k
    int kind = PyUnicode_KIND(self);
12107
63.3k
    const void *data = PyUnicode_DATA(self);
12108
63.3k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12109
    /* PEP 3131 says that the first character must be in
12110
       XID_Start and subsequent characters in XID_Continue,
12111
       and for the ASCII range, the 2.x rules apply (i.e.
12112
       start with letters and underscore, continue with
12113
       letters, digits, underscore). However, given the current
12114
       definition of XID_Start and XID_Continue, it is sufficient
12115
       to check just for these, except that _ must be allowed
12116
       as starting an identifier.  */
12117
63.3k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12118
808
        return 0;
12119
808
    }
12120
12121
520k
    for (i = 1; i < len; i++) {
12122
458k
        ch = PyUnicode_READ(kind, data, i);
12123
458k
        if (!_PyUnicode_IsXidContinue(ch)) {
12124
341
            return i;
12125
341
        }
12126
458k
    }
12127
62.2k
    return i;
12128
62.5k
}
12129
12130
int
12131
PyUnicode_IsIdentifier(PyObject *self)
12132
51.5k
{
12133
51.5k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12134
51.5k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12135
    /* an empty string is not a valid identifier */
12136
51.5k
    return len && i == len;
12137
51.5k
}
12138
12139
/*[clinic input]
12140
@permit_long_docstring_body
12141
str.isidentifier as unicode_isidentifier
12142
12143
Return True if the string is a valid Python identifier, False otherwise.
12144
12145
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12146
such as "def" or "class".
12147
[clinic start generated code]*/
12148
12149
static PyObject *
12150
unicode_isidentifier_impl(PyObject *self)
12151
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
12152
49.1k
{
12153
49.1k
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12154
49.1k
}
12155
12156
/*[clinic input]
12157
@permit_long_summary
12158
str.isprintable as unicode_isprintable
12159
12160
Return True if all characters in the string are printable, False otherwise.
12161
12162
A character is printable if repr() may use it in its output.
12163
[clinic start generated code]*/
12164
12165
static PyObject *
12166
unicode_isprintable_impl(PyObject *self)
12167
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
12168
2.00M
{
12169
2.00M
    Py_ssize_t i, length;
12170
2.00M
    int kind;
12171
2.00M
    const void *data;
12172
12173
2.00M
    length = PyUnicode_GET_LENGTH(self);
12174
2.00M
    kind = PyUnicode_KIND(self);
12175
2.00M
    data = PyUnicode_DATA(self);
12176
12177
    /* Shortcut for single character strings */
12178
2.00M
    if (length == 1)
12179
2.00M
        return PyBool_FromLong(
12180
2.00M
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12181
12182
0
    for (i = 0; i < length; i++) {
12183
0
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12184
0
            Py_RETURN_FALSE;
12185
0
        }
12186
0
    }
12187
0
    Py_RETURN_TRUE;
12188
0
}
12189
12190
/*[clinic input]
12191
@permit_long_docstring_body
12192
str.join as unicode_join
12193
12194
    iterable: object
12195
    /
12196
12197
Concatenate any number of strings.
12198
12199
The string whose method is called is inserted in between each given string.
12200
The result is returned as a new string.
12201
12202
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12203
[clinic start generated code]*/
12204
12205
static PyObject *
12206
unicode_join(PyObject *self, PyObject *iterable)
12207
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
12208
20.1M
{
12209
20.1M
    return PyUnicode_Join(self, iterable);
12210
20.1M
}
12211
12212
static Py_ssize_t
12213
unicode_length(PyObject *self)
12214
32.5M
{
12215
32.5M
    return PyUnicode_GET_LENGTH(self);
12216
32.5M
}
12217
12218
/*[clinic input]
12219
str.ljust as unicode_ljust
12220
12221
    width: Py_ssize_t
12222
    fillchar: Py_UCS4 = ' '
12223
    /
12224
12225
Return a left-justified string of length width.
12226
12227
Padding is done using the specified fill character (default is a space).
12228
[clinic start generated code]*/
12229
12230
static PyObject *
12231
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12232
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12233
130
{
12234
130
    if (PyUnicode_GET_LENGTH(self) >= width)
12235
62
        return unicode_result_unchanged(self);
12236
12237
68
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12238
130
}
12239
12240
/*[clinic input]
12241
str.lower as unicode_lower
12242
12243
Return a copy of the string converted to lowercase.
12244
[clinic start generated code]*/
12245
12246
static PyObject *
12247
unicode_lower_impl(PyObject *self)
12248
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12249
76.6M
{
12250
76.6M
    if (PyUnicode_IS_ASCII(self))
12251
71.0M
        return ascii_upper_or_lower(self, 1);
12252
5.62M
    return case_operation(self, do_lower);
12253
76.6M
}
12254
12255
68.9M
#define LEFTSTRIP 0
12256
83.9M
#define RIGHTSTRIP 1
12257
47.7M
#define BOTHSTRIP 2
12258
12259
/* Array of method names, indexed by the LEFTSTRIP/RIGHTSTRIP/BOTHSTRIP constants above */
12260
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
12261
12262
0
#define STRIPNAME(i) (stripfuncnames[i])
12263
12264
/* externally visible for str.strip(unicode) */
12265
PyObject *
12266
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12267
4.79M
{
12268
4.79M
    const void *data;
12269
4.79M
    int kind;
12270
4.79M
    Py_ssize_t i, j, len;
12271
4.79M
    BLOOM_MASK sepmask;
12272
4.79M
    Py_ssize_t seplen;
12273
12274
4.79M
    kind = PyUnicode_KIND(self);
12275
4.79M
    data = PyUnicode_DATA(self);
12276
4.79M
    len = PyUnicode_GET_LENGTH(self);
12277
4.79M
    seplen = PyUnicode_GET_LENGTH(sepobj);
12278
4.79M
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12279
4.79M
                              PyUnicode_DATA(sepobj),
12280
4.79M
                              seplen);
12281
12282
4.79M
    i = 0;
12283
4.79M
    if (striptype != RIGHTSTRIP) {
12284
491k
        while (i < len) {
12285
488k
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12286
488k
            if (!BLOOM(sepmask, ch))
12287
457k
                break;
12288
31.2k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12289
2.71k
                break;
12290
28.5k
            i++;
12291
28.5k
        }
12292
462k
    }
12293
12294
4.79M
    j = len;
12295
4.79M
    if (striptype != LEFTSTRIP) {
12296
4.32M
        j--;
12297
5.09M
        while (j >= i) {
12298
3.99M
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12299
3.99M
            if (!BLOOM(sepmask, ch))
12300
3.13M
                break;
12301
861k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12302
97.0k
                break;
12303
764k
            j--;
12304
764k
        }
12305
12306
4.32M
        j++;
12307
4.32M
    }
12308
12309
4.79M
    return PyUnicode_Substring(self, i, j);
12310
4.79M
}
12311
12312
PyObject*
12313
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
12314
34.6M
{
12315
34.6M
    assert(PyUnicode_CheckExact(container));
12316
34.6M
    Py_ssize_t len = PyUnicode_GET_LENGTH(container);
12317
34.6M
    Py_ssize_t istart, istop;
12318
34.6M
    if (!_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
12319
0
        return NULL;
12320
0
    }
12321
34.6M
    return PyUnicode_Substring(container, istart, istop);
12322
34.6M
}
12323
12324
PyObject*
12325
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12326
264M
{
12327
264M
    const unsigned char *data;
12328
264M
    int kind;
12329
264M
    Py_ssize_t length;
12330
12331
264M
    length = PyUnicode_GET_LENGTH(self);
12332
264M
    end = Py_MIN(end, length);
12333
12334
264M
    if (start == 0 && end == length)
12335
70.3M
        return unicode_result_unchanged(self);
12336
12337
194M
    if (start < 0 || end < 0) {
12338
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12339
0
        return NULL;
12340
0
    }
12341
194M
    if (start >= length || end < start)
12342
5.32M
        _Py_RETURN_UNICODE_EMPTY();
12343
12344
188M
    length = end - start;
12345
188M
    if (PyUnicode_IS_ASCII(self)) {
12346
65.0M
        data = PyUnicode_1BYTE_DATA(self);
12347
65.0M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12348
65.0M
    }
12349
123M
    else {
12350
123M
        kind = PyUnicode_KIND(self);
12351
123M
        data = PyUnicode_1BYTE_DATA(self);
12352
123M
        return PyUnicode_FromKindAndData(kind,
12353
123M
                                         data + kind * start,
12354
123M
                                         length);
12355
123M
    }
12356
188M
}
12357
12358
static PyObject *
12359
do_strip(PyObject *self, int striptype)
12360
62.0M
{
12361
62.0M
    Py_ssize_t len, i, j;
12362
12363
62.0M
    len = PyUnicode_GET_LENGTH(self);
12364
12365
62.0M
    if (PyUnicode_IS_ASCII(self)) {
12366
48.7M
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12367
12368
48.7M
        i = 0;
12369
48.7M
        if (striptype != RIGHTSTRIP) {
12370
43.8M
            while (i < len) {
12371
36.5M
                Py_UCS1 ch = data[i];
12372
36.5M
                if (!_Py_ascii_whitespace[ch])
12373
31.8M
                    break;
12374
4.75M
                i++;
12375
4.75M
            }
12376
39.0M
        }
12377
12378
48.7M
        j = len;
12379
48.7M
        if (striptype != LEFTSTRIP) {
12380
48.4M
            j--;
12381
53.4M
            while (j >= i) {
12382
41.0M
                Py_UCS1 ch = data[j];
12383
41.0M
                if (!_Py_ascii_whitespace[ch])
12384
36.0M
                    break;
12385
5.01M
                j--;
12386
5.01M
            }
12387
48.4M
            j++;
12388
48.4M
        }
12389
48.7M
    }
12390
13.2M
    else {
12391
13.2M
        int kind = PyUnicode_KIND(self);
12392
13.2M
        const void *data = PyUnicode_DATA(self);
12393
12394
13.2M
        i = 0;
12395
13.2M
        if (striptype != RIGHTSTRIP) {
12396
12.4M
            while (i < len) {
12397
12.4M
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12398
12.4M
                if (!Py_UNICODE_ISSPACE(ch))
12399
10.2M
                    break;
12400
2.21M
                i++;
12401
2.21M
            }
12402
10.2M
        }
12403
12404
13.2M
        j = len;
12405
13.2M
        if (striptype != LEFTSTRIP) {
12406
11.9M
            j--;
12407
14.8M
            while (j >= i) {
12408
14.8M
                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12409
14.8M
                if (!Py_UNICODE_ISSPACE(ch))
12410
11.9M
                    break;
12411
2.87M
                j--;
12412
2.87M
            }
12413
11.9M
            j++;
12414
11.9M
        }
12415
13.2M
    }
12416
12417
62.0M
    return PyUnicode_Substring(self, i, j);
12418
62.0M
}
12419
12420
12421
static PyObject *
12422
do_argstrip(PyObject *self, int striptype, PyObject *sep)
12423
66.8M
{
12424
66.8M
    if (sep != Py_None) {
12425
4.79M
        if (PyUnicode_Check(sep))
12426
4.79M
            return _PyUnicode_XStrip(self, striptype, sep);
12427
0
        else {
12428
0
            PyErr_Format(PyExc_TypeError,
12429
0
                         "%s arg must be None or str",
12430
0
                         STRIPNAME(striptype));
12431
0
            return NULL;
12432
0
        }
12433
4.79M
    }
12434
12435
62.0M
    return do_strip(self, striptype);
12436
66.8M
}
12437
12438
12439
/*[clinic input]
12440
@permit_long_summary
12441
str.strip as unicode_strip
12442
12443
    chars: object = None
12444
    /
12445
12446
Return a copy of the string with leading and trailing whitespace removed.
12447
12448
If chars is given and not None, remove characters in chars instead.
12449
[clinic start generated code]*/
12450
12451
static PyObject *
12452
unicode_strip_impl(PyObject *self, PyObject *chars)
12453
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
12454
47.7M
{
12455
47.7M
    return do_argstrip(self, BOTHSTRIP, chars);
12456
47.7M
}
12457
12458
12459
/*[clinic input]
12460
str.lstrip as unicode_lstrip
12461
12462
    chars: object = None
12463
    /
12464
12465
Return a copy of the string with leading whitespace removed.
12466
12467
If chars is given and not None, remove characters in chars instead.
12468
[clinic start generated code]*/
12469
12470
static PyObject *
12471
unicode_lstrip_impl(PyObject *self, PyObject *chars)
12472
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12473
2.06M
{
12474
2.06M
    return do_argstrip(self, LEFTSTRIP, chars);
12475
2.06M
}
12476
12477
12478
/*[clinic input]
12479
str.rstrip as unicode_rstrip
12480
12481
    chars: object = None
12482
    /
12483
12484
Return a copy of the string with trailing whitespace removed.
12485
12486
If chars is given and not None, remove characters in chars instead.
12487
[clinic start generated code]*/
12488
12489
static PyObject *
12490
unicode_rstrip_impl(PyObject *self, PyObject *chars)
12491
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12492
17.0M
{
12493
17.0M
    return do_argstrip(self, RIGHTSTRIP, chars);
12494
17.0M
}
12495
12496
12497
static PyObject*
12498
unicode_repeat(PyObject *str, Py_ssize_t len)
12499
402k
{
12500
402k
    PyObject *u;
12501
402k
    Py_ssize_t nchars, n;
12502
12503
402k
    if (len < 1)
12504
33.4k
        _Py_RETURN_UNICODE_EMPTY();
12505
12506
    /* no repeat, return original string */
12507
368k
    if (len == 1)
12508
28.8k
        return unicode_result_unchanged(str);
12509
12510
340k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12511
0
        PyErr_SetString(PyExc_OverflowError,
12512
0
                        "repeated string is too long");
12513
0
        return NULL;
12514
0
    }
12515
340k
    nchars = len * PyUnicode_GET_LENGTH(str);
12516
12517
340k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12518
340k
    if (!u)
12519
0
        return NULL;
12520
340k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12521
12522
340k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12523
337k
        int kind = PyUnicode_KIND(str);
12524
337k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12525
337k
        if (kind == PyUnicode_1BYTE_KIND) {
12526
337k
            void *to = PyUnicode_DATA(u);
12527
337k
            memset(to, (unsigned char)fill_char, len);
12528
337k
        }
12529
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12530
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12531
0
            for (n = 0; n < len; ++n)
12532
0
                ucs2[n] = fill_char;
12533
0
        } else {
12534
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12535
0
            assert(kind == PyUnicode_4BYTE_KIND);
12536
0
            for (n = 0; n < len; ++n)
12537
0
                ucs4[n] = fill_char;
12538
0
        }
12539
337k
    }
12540
2.38k
    else {
12541
2.38k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12542
2.38k
        char *to = (char *) PyUnicode_DATA(u);
12543
2.38k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12544
2.38k
            PyUnicode_GET_LENGTH(str) * char_size);
12545
2.38k
    }
12546
12547
340k
    assert(_PyUnicode_CheckConsistency(u, 1));
12548
340k
    return u;
12549
340k
}
12550
12551
PyObject *
12552
PyUnicode_Replace(PyObject *str,
12553
                  PyObject *substr,
12554
                  PyObject *replstr,
12555
                  Py_ssize_t maxcount)
12556
0
{
12557
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12558
0
            ensure_unicode(replstr) < 0)
12559
0
        return NULL;
12560
0
    return replace(str, substr, replstr, maxcount);
12561
0
}
12562
12563
/*[clinic input]
12564
str.replace as unicode_replace
12565
12566
    old: unicode
12567
    new: unicode
12568
    /
12569
    count: Py_ssize_t = -1
12570
        Maximum number of occurrences to replace.
12571
        -1 (the default value) means replace all occurrences.
12572
12573
Return a copy with all occurrences of substring old replaced by new.
12574
12575
If count is given, only the first count occurrences are replaced.
12576
If count is not specified or -1, then all occurrences are replaced.
12577
[clinic start generated code]*/
12578
12579
static PyObject *
12580
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12581
                     Py_ssize_t count)
12582
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12583
18.5M
{
12584
18.5M
    return replace(self, old, new, count);
12585
18.5M
}
12586
12587
/*[clinic input]
12588
@permit_long_docstring_body
12589
str.removeprefix as unicode_removeprefix
12590
12591
    prefix: unicode
12592
    /
12593
12594
Return a str with the given prefix string removed if present.
12595
12596
If the string starts with the prefix string, return string[len(prefix):].
12597
Otherwise, return a copy of the original string.
12598
[clinic start generated code]*/
12599
12600
static PyObject *
12601
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12602
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
12603
308
{
12604
308
    int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12605
308
    if (match == -1) {
12606
0
        return NULL;
12607
0
    }
12608
308
    if (match) {
12609
80
        return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12610
80
                                   PyUnicode_GET_LENGTH(self));
12611
80
    }
12612
228
    return unicode_result_unchanged(self);
12613
308
}
12614
12615
/*[clinic input]
12616
str.removesuffix as unicode_removesuffix
12617
12618
    suffix: unicode
12619
    /
12620
12621
Return a str with the given suffix string removed if present.
12622
12623
If the string ends with the suffix string and that suffix is not empty,
12624
return string[:-len(suffix)]. Otherwise, return a copy of the original
12625
string.
12626
[clinic start generated code]*/
12627
12628
static PyObject *
12629
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12630
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
12631
0
{
12632
0
    int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12633
0
    if (match == -1) {
12634
0
        return NULL;
12635
0
    }
12636
0
    if (match) {
12637
0
        return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12638
0
                                            - PyUnicode_GET_LENGTH(suffix));
12639
0
    }
12640
0
    return unicode_result_unchanged(self);
12641
0
}
12642
12643
static PyObject *
12644
unicode_repr(PyObject *unicode)
12645
13.1M
{
12646
13.1M
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
12647
13.1M
    const void *idata = PyUnicode_DATA(unicode);
12648
12649
    /* Compute length of output, quote characters, and
12650
       maximum character */
12651
13.1M
    Py_ssize_t osize = 0;
12652
13.1M
    Py_UCS4 maxch = 127;
12653
13.1M
    Py_ssize_t squote = 0;
12654
13.1M
    Py_ssize_t dquote = 0;
12655
13.1M
    int ikind = PyUnicode_KIND(unicode);
12656
289M
    for (Py_ssize_t i = 0; i < isize; i++) {
12657
276M
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12658
276M
        Py_ssize_t incr = 1;
12659
276M
        switch (ch) {
12660
181k
        case '\'': squote++; break;
12661
591k
        case '"':  dquote++; break;
12662
4.49M
        case '\\': case '\t': case '\r': case '\n':
12663
4.49M
            incr = 2;
12664
4.49M
            break;
12665
270M
        default:
12666
            /* Fast-path ASCII */
12667
270M
            if (ch < ' ' || ch == 0x7f)
12668
161M
                incr = 4; /* \xHH */
12669
109M
            else if (ch < 0x7f)
12670
98.1M
                ;
12671
11.0M
            else if (Py_UNICODE_ISPRINTABLE(ch))
12672
10.6M
                maxch = (ch > maxch) ? ch : maxch;
12673
373k
            else if (ch < 0x100)
12674
85.3k
                incr = 4; /* \xHH */
12675
288k
            else if (ch < 0x10000)
12676
84.8k
                incr = 6; /* \uHHHH */
12677
203k
            else
12678
203k
                incr = 10; /* \UHHHHHHHH */
12679
276M
        }
12680
276M
        if (osize > PY_SSIZE_T_MAX - incr) {
12681
0
            PyErr_SetString(PyExc_OverflowError,
12682
0
                            "string is too long to generate repr");
12683
0
            return NULL;
12684
0
        }
12685
276M
        osize += incr;
12686
276M
    }
12687
12688
13.1M
    Py_UCS4 quote = '\'';
12689
13.1M
    int changed = (osize != isize);
12690
13.1M
    if (squote) {
12691
96.3k
        changed = 1;
12692
96.3k
        if (dquote)
12693
            /* Both squote and dquote present. Use squote,
12694
               and escape them */
12695
7.80k
            osize += squote;
12696
88.5k
        else
12697
88.5k
            quote = '"';
12698
96.3k
    }
12699
13.1M
    osize += 2;   /* quotes */
12700
12701
13.1M
    PyObject *repr = PyUnicode_New(osize, maxch);
12702
13.1M
    if (repr == NULL)
12703
0
        return NULL;
12704
13.1M
    int okind = PyUnicode_KIND(repr);
12705
13.1M
    void *odata = PyUnicode_DATA(repr);
12706
12707
13.1M
    if (!changed) {
12708
6.89M
        PyUnicode_WRITE(okind, odata, 0, quote);
12709
12710
6.89M
        _PyUnicode_FastCopyCharacters(repr, 1,
12711
6.89M
                                      unicode, 0,
12712
6.89M
                                      isize);
12713
12714
6.89M
        PyUnicode_WRITE(okind, odata, osize-1, quote);
12715
6.89M
    }
12716
6.27M
    else {
12717
6.27M
        switch (okind) {
12718
6.01M
        case PyUnicode_1BYTE_KIND:
12719
6.01M
            ucs1lib_repr(unicode, quote, odata);
12720
6.01M
            break;
12721
251k
        case PyUnicode_2BYTE_KIND:
12722
251k
            ucs2lib_repr(unicode, quote, odata);
12723
251k
            break;
12724
9.35k
        default:
12725
9.35k
            assert(okind == PyUnicode_4BYTE_KIND);
12726
9.35k
            ucs4lib_repr(unicode, quote, odata);
12727
6.27M
        }
12728
6.27M
    }
12729
12730
13.1M
    assert(_PyUnicode_CheckConsistency(repr, 1));
12731
13.1M
    return repr;
12732
13.1M
}
12733
12734
/*[clinic input]
12735
@permit_long_summary
12736
str.rfind as unicode_rfind = str.count
12737
12738
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12739
12740
Optional arguments start and end are interpreted as in slice notation.
12741
Return -1 on failure.
12742
[clinic start generated code]*/
12743
12744
static Py_ssize_t
12745
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12746
                   Py_ssize_t end)
12747
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12748
325k
{
12749
325k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12750
325k
    if (result < 0) {
12751
10.6k
        return -1;
12752
10.6k
    }
12753
314k
    return result;
12754
325k
}
12755
12756
/*[clinic input]
12757
@permit_long_summary
12758
str.rindex as unicode_rindex = str.count
12759
12760
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12761
12762
Optional arguments start and end are interpreted as in slice notation.
12763
Raises ValueError when the substring is not found.
12764
[clinic start generated code]*/
12765
12766
static Py_ssize_t
12767
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12768
                    Py_ssize_t end)
12769
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12770
120k
{
12771
120k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12772
120k
    if (result == -1) {
12773
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12774
0
    }
12775
120k
    else if (result < 0) {
12776
0
        return -1;
12777
0
    }
12778
120k
    return result;
12779
120k
}
12780
12781
/*[clinic input]
12782
str.rjust as unicode_rjust
12783
12784
    width: Py_ssize_t
12785
    fillchar: Py_UCS4 = ' '
12786
    /
12787
12788
Return a right-justified string of length width.
12789
12790
Padding is done using the specified fill character (default is a space).
12791
[clinic start generated code]*/
12792
12793
static PyObject *
12794
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12795
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12796
0
{
12797
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12798
0
        return unicode_result_unchanged(self);
12799
12800
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12801
0
}
12802
12803
PyObject *
12804
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12805
0
{
12806
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12807
0
        return NULL;
12808
12809
0
    return split(s, sep, maxsplit);
12810
0
}
12811
12812
/*[clinic input]
12813
@permit_long_summary
12814
str.split as unicode_split
12815
12816
    sep: object = None
12817
        The separator used to split the string.
12818
12819
        When set to None (the default value), will split on any whitespace
12820
        character (including \n \r \t \f and spaces) and will discard
12821
        empty strings from the result.
12822
    maxsplit: Py_ssize_t = -1
12823
        Maximum number of splits.
12824
        -1 (the default value) means no limit.
12825
12826
Return a list of the substrings in the string, using sep as the separator string.
12827
12828
Splitting starts at the front of the string and works to the end.
12829
12830
Note, str.split() is mainly useful for data that has been intentionally
12831
delimited.  With natural text that includes punctuation, consider using
12832
the regular expression module.
12833
12834
[clinic start generated code]*/
12835
12836
static PyObject *
12837
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12838
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12839
22.0M
{
12840
22.0M
    if (sep == Py_None)
12841
155k
        return split(self, NULL, maxsplit);
12842
21.8M
    if (PyUnicode_Check(sep))
12843
21.8M
        return split(self, sep, maxsplit);
12844
12845
0
    PyErr_Format(PyExc_TypeError,
12846
0
                 "must be str or None, not %.100s",
12847
0
                 Py_TYPE(sep)->tp_name);
12848
0
    return NULL;
12849
21.8M
}
12850
12851
PyObject *
12852
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12853
8.93M
{
12854
8.93M
    PyObject* out;
12855
8.93M
    int kind1, kind2;
12856
8.93M
    const void *buf1, *buf2;
12857
8.93M
    Py_ssize_t len1, len2;
12858
12859
8.93M
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12860
0
        return NULL;
12861
12862
8.93M
    kind1 = PyUnicode_KIND(str_obj);
12863
8.93M
    kind2 = PyUnicode_KIND(sep_obj);
12864
8.93M
    len1 = PyUnicode_GET_LENGTH(str_obj);
12865
8.93M
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12866
8.93M
    if (kind1 < kind2 || len1 < len2) {
12867
1.26k
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
12868
1.26k
        return PyTuple_Pack(3, str_obj, empty, empty);
12869
1.26k
    }
12870
8.93M
    buf1 = PyUnicode_DATA(str_obj);
12871
8.93M
    buf2 = PyUnicode_DATA(sep_obj);
12872
8.93M
    if (kind2 != kind1) {
12873
81.1k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
12874
81.1k
        if (!buf2)
12875
0
            return NULL;
12876
81.1k
    }
12877
12878
8.93M
    switch (kind1) {
12879
8.85M
    case PyUnicode_1BYTE_KIND:
12880
8.85M
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12881
2.97M
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12882
5.88M
        else
12883
5.88M
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12884
8.85M
        break;
12885
69.8k
    case PyUnicode_2BYTE_KIND:
12886
69.8k
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12887
69.8k
        break;
12888
11.3k
    case PyUnicode_4BYTE_KIND:
12889
11.3k
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12890
11.3k
        break;
12891
0
    default:
12892
0
        Py_UNREACHABLE();
12893
8.93M
    }
12894
12895
8.93M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12896
8.93M
    if (kind2 != kind1)
12897
81.1k
        PyMem_Free((void *)buf2);
12898
12899
8.93M
    return out;
12900
8.93M
}
12901
12902
12903
PyObject *
12904
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12905
49.7k
{
12906
49.7k
    PyObject* out;
12907
49.7k
    int kind1, kind2;
12908
49.7k
    const void *buf1, *buf2;
12909
49.7k
    Py_ssize_t len1, len2;
12910
12911
49.7k
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12912
0
        return NULL;
12913
12914
49.7k
    kind1 = PyUnicode_KIND(str_obj);
12915
49.7k
    kind2 = PyUnicode_KIND(sep_obj);
12916
49.7k
    len1 = PyUnicode_GET_LENGTH(str_obj);
12917
49.7k
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12918
49.7k
    if (kind1 < kind2 || len1 < len2) {
12919
0
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
12920
0
        return PyTuple_Pack(3, empty, empty, str_obj);
12921
0
    }
12922
49.7k
    buf1 = PyUnicode_DATA(str_obj);
12923
49.7k
    buf2 = PyUnicode_DATA(sep_obj);
12924
49.7k
    if (kind2 != kind1) {
12925
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
12926
0
        if (!buf2)
12927
0
            return NULL;
12928
0
    }
12929
12930
49.7k
    switch (kind1) {
12931
49.7k
    case PyUnicode_1BYTE_KIND:
12932
49.7k
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12933
49.7k
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12934
0
        else
12935
0
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12936
49.7k
        break;
12937
0
    case PyUnicode_2BYTE_KIND:
12938
0
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12939
0
        break;
12940
0
    case PyUnicode_4BYTE_KIND:
12941
0
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12942
0
        break;
12943
0
    default:
12944
0
        Py_UNREACHABLE();
12945
49.7k
    }
12946
12947
49.7k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12948
49.7k
    if (kind2 != kind1)
12949
0
        PyMem_Free((void *)buf2);
12950
12951
49.7k
    return out;
12952
49.7k
}
12953
12954
/*[clinic input]
12955
@permit_long_docstring_body
12956
str.partition as unicode_partition
12957
12958
    sep: object
12959
    /
12960
12961
Partition the string into three parts using the given separator.
12962
12963
This will search for the separator in the string.  If the separator is found,
12964
returns a 3-tuple containing the part before the separator, the separator
12965
itself, and the part after it.
12966
12967
If the separator is not found, returns a 3-tuple containing the original string
12968
and two empty strings.
12969
[clinic start generated code]*/
12970
12971
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition(sep): thin method wrapper.  All validation and the
       actual search are left to PyUnicode_Partition(); its result (or NULL
       on error) is handed back untouched. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
12977
12978
/*[clinic input]
12979
@permit_long_docstring_body
12980
str.rpartition as unicode_rpartition = str.partition
12981
12982
Partition the string into three parts using the given separator.
12983
12984
This will search for the separator in the string, starting at the end. If
12985
the separator is found, returns a 3-tuple containing the part before the
12986
separator, the separator itself, and the part after it.
12987
12988
If the separator is not found, returns a 3-tuple containing two empty strings
12989
and the original string.
12990
[clinic start generated code]*/
12991
12992
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition(sep): thin method wrapper.  The work is done by
       PyUnicode_RPartition(); its result (or NULL on error) is returned
       as-is. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
12998
12999
PyObject *
13000
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13001
0
{
13002
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13003
0
        return NULL;
13004
13005
0
    return rsplit(s, sep, maxsplit);
13006
0
}
13007
13008
/*[clinic input]
13009
@permit_long_summary
13010
str.rsplit as unicode_rsplit = str.split
13011
13012
Return a list of the substrings in the string, using sep as the separator string.
13013
13014
Splitting starts at the end of the string and works to the front.
13015
[clinic start generated code]*/
13016
13017
static PyObject *
13018
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13019
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13020
66
{
13021
66
    if (sep == Py_None)
13022
0
        return rsplit(self, NULL, maxsplit);
13023
66
    if (PyUnicode_Check(sep))
13024
66
        return rsplit(self, sep, maxsplit);
13025
13026
0
    PyErr_Format(PyExc_TypeError,
13027
0
                 "must be str or None, not %.100s",
13028
0
                 Py_TYPE(sep)->tp_name);
13029
0
    return NULL;
13030
66
}
13031
13032
/*[clinic input]
13033
@permit_long_docstring_body
13034
str.splitlines as unicode_splitlines
13035
13036
    keepends: bool = False
13037
13038
Return a list of the lines in the string, breaking at line boundaries.
13039
13040
Line breaks are not included in the resulting list unless keepends is given and
13041
true.
13042
[clinic start generated code]*/
13043
13044
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines(keepends): forwards both arguments unchanged to
       PyUnicode_Splitlines() and returns its result. */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13050
13051
static
13052
PyObject *unicode_str(PyObject *self)
13053
2.80M
{
13054
2.80M
    return unicode_result_unchanged(self);
13055
2.80M
}
13056
13057
/*[clinic input]
13058
@permit_long_summary
13059
str.swapcase as unicode_swapcase
13060
13061
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13062
[clinic start generated code]*/
13063
13064
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the generic case-conversion driver with the
       do_swapcase callback and return what it produces. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13070
13071
static int
13072
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13073
0
{
13074
0
    PyObject *key, *value;
13075
0
    Py_ssize_t i = 0;
13076
0
    int res;
13077
0
    while (PyDict_Next(x, &i, &key, &value)) {
13078
0
        if (PyUnicode_Check(key)) {
13079
0
            PyObject *newkey;
13080
0
            int kind;
13081
0
            const void *data;
13082
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13083
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13084
0
                                "table must be of length 1");
13085
0
                return -1;
13086
0
            }
13087
0
            kind = PyUnicode_KIND(key);
13088
0
            data = PyUnicode_DATA(key);
13089
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13090
0
            if (!newkey)
13091
0
                return -1;
13092
0
            res = PyDict_SetItem(newdict, newkey, value);
13093
0
            Py_DECREF(newkey);
13094
0
            if (res < 0)
13095
0
                return -1;
13096
0
        }
13097
0
        else if (PyLong_Check(key)) {
13098
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13099
0
                return -1;
13100
0
        }
13101
0
        else {
13102
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13103
0
                            "be strings or integers");
13104
0
            return -1;
13105
0
        }
13106
0
    }
13107
0
    return 0;
13108
0
}
13109
13110
/*[clinic input]
13111
13112
@staticmethod
13113
str.maketrans as unicode_maketrans
13114
13115
  x: object
13116
13117
  y: unicode=NULL
13118
13119
  z: unicode=NULL
13120
13121
  /
13122
13123
Return a translation table usable for str.translate().
13124
13125
If there is only one argument, it must be a dictionary mapping Unicode
13126
ordinals (integers) or characters to Unicode ordinals, strings or None.
13127
Character keys will be then converted to ordinals.
13128
If there are two arguments, they must be strings of equal length, and
13129
in the resulting dictionary, each character in x will be mapped to the
13130
character at the same position in y. If there is a third argument, it
13131
must be a string, whose characters will be mapped to None in the result.
13132
[clinic start generated code]*/
13133
13134
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* str.maketrans(): build and return a new dict.

       With `y` given, `x` must be a str of the same length as `y`; each
       code point of `x` is mapped to the code point at the same position
       in `y`, and (if `z` is given) each code point of `z` is mapped to
       None.  Without `y`, `x` must be a dict whose entries are copied by
       unicode_maketrans_from_dict().  On any failure the partially built
       dict is released and NULL is returned with an exception set. */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y != NULL) {
        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto err;
        }
        const Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);
        if (x_len != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto err;
        }

        /* map x[pos] -> y[pos] for every position */
        const int x_kind = PyUnicode_KIND(x);
        const int y_kind = PyUnicode_KIND(y);
        const void *x_data = PyUnicode_DATA(x);
        const void *y_data = PyUnicode_DATA(y);
        for (Py_ssize_t pos = 0; pos < x_len; pos++) {
            PyObject *from = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, pos));
            if (from == NULL) {
                goto err;
            }
            PyObject *to = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, pos));
            if (to == NULL) {
                Py_DECREF(from);
                goto err;
            }
            int status = PyDict_SetItem(table, from, to);
            Py_DECREF(from);
            Py_DECREF(to);
            if (status < 0) {
                goto err;
            }
        }

        /* map every character of z to None (i.e. mark it for deletion) */
        if (z != NULL) {
            const int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            const Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);
            for (Py_ssize_t pos = 0; pos < z_len; pos++) {
                PyObject *ordinal = PyLong_FromLong(
                    PyUnicode_READ(z_kind, z_data, pos));
                if (ordinal == NULL) {
                    goto err;
                }
                int status = PyDict_SetItem(table, ordinal, Py_None);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto err;
                }
            }
        }
    }
    else {
        /* x must be a dict */
        if (!PyAnyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto err;
        }
        /* copy the entries while holding the critical section on x,
           converting string keys to int keys */
        int status;
        Py_BEGIN_CRITICAL_SECTION(x);
        status = unicode_maketrans_from_dict(x, table);
        Py_END_CRITICAL_SECTION();
        if (status < 0) {
            goto err;
        }
    }
    return table;

  err:
    Py_DECREF(table);
    return NULL;
}
13214
13215
/*[clinic input]
13216
@permit_long_docstring_body
13217
str.translate as unicode_translate
13218
13219
    table: object
13220
        Translation table, which must be a mapping of Unicode ordinals to
13221
        Unicode ordinals, strings, or None.
13222
    /
13223
13224
Replace each character in the string using the given translation table.
13225
13226
The table must implement lookup/indexing via __getitem__, for instance a
13227
dictionary or list.  If this operation raises LookupError, the character is
13228
left untouched.  Characters mapped to None are deleted.
13229
[clinic start generated code]*/
13230
13231
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(table): delegate to the charmap translator, always
       passing "ignore" as the error-handler name. */
    const char *errors = "ignore";
    return _PyUnicode_TranslateCharmap(self, table, errors);
}
13237
13238
/*[clinic input]
13239
str.upper as unicode_upper
13240
13241
Return a copy of the string converted to uppercase.
13242
[clinic start generated code]*/
13243
13244
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* str.upper(): non-ASCII strings go through the generic
       case_operation() driver with do_upper; ASCII strings use the
       dedicated ascii_upper_or_lower() helper (second argument 0 --
       presumably "not lower"; confirm against the helper). */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }
    return ascii_upper_or_lower(self, 0);
}
13252
13253
/*[clinic input]
13254
@permit_long_summary
13255
str.zfill as unicode_zfill
13256
13257
    width: Py_ssize_t
13258
    /
13259
13260
Pad a numeric string with zeros on the left, to fill a field of the given width.
13261
13262
The string is never truncated.
13263
[clinic start generated code]*/
13264
13265
static PyObject *
13266
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13267
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13268
0
{
13269
0
    Py_ssize_t fill;
13270
0
    PyObject *u;
13271
0
    int kind;
13272
0
    const void *data;
13273
0
    Py_UCS4 chr;
13274
13275
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13276
0
        return unicode_result_unchanged(self);
13277
13278
0
    fill = width - PyUnicode_GET_LENGTH(self);
13279
13280
0
    u = pad(self, fill, 0, '0');
13281
13282
0
    if (u == NULL)
13283
0
        return NULL;
13284
13285
0
    kind = PyUnicode_KIND(u);
13286
0
    data = PyUnicode_DATA(u);
13287
0
    chr = PyUnicode_READ(kind, data, fill);
13288
13289
0
    if (chr == '+' || chr == '-') {
13290
        /* move sign to beginning of string */
13291
0
        PyUnicode_WRITE(kind, data, 0, chr);
13292
0
        PyUnicode_WRITE(kind, data, fill, '0');
13293
0
    }
13294
13295
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13296
0
    return u;
13297
0
}
13298
13299
/*[clinic input]
13300
@permit_long_summary
13301
@text_signature "($self, prefix[, start[, end]], /)"
13302
str.startswith as unicode_startswith
13303
13304
    prefix as subobj: object
13305
        A string or a tuple of strings to try.
13306
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13307
        Optional start position. Default: start of the string.
13308
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13309
        Optional stop position. Default: end of the string.
13310
    /
13311
13312
Return True if the string starts with the specified prefix, False otherwise.
13313
[clinic start generated code]*/
13314
13315
static PyObject *
13316
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13317
                        Py_ssize_t end)
13318
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13319
44.2M
{
13320
44.2M
    if (PyTuple_Check(subobj)) {
13321
1.68M
        Py_ssize_t i;
13322
6.10M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13323
4.44M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13324
4.44M
            if (!PyUnicode_Check(substring)) {
13325
0
                PyErr_Format(PyExc_TypeError,
13326
0
                             "tuple for startswith must only contain str, "
13327
0
                             "not %.100s",
13328
0
                             Py_TYPE(substring)->tp_name);
13329
0
                return NULL;
13330
0
            }
13331
4.44M
            int result = tailmatch(self, substring, start, end, -1);
13332
4.44M
            if (result < 0) {
13333
0
                return NULL;
13334
0
            }
13335
4.44M
            if (result) {
13336
18.9k
                Py_RETURN_TRUE;
13337
18.9k
            }
13338
4.44M
        }
13339
        /* nothing matched */
13340
1.68M
        Py_RETURN_FALSE;
13341
1.68M
    }
13342
42.5M
    if (!PyUnicode_Check(subobj)) {
13343
0
        PyErr_Format(PyExc_TypeError,
13344
0
                     "startswith first arg must be str or "
13345
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13346
0
        return NULL;
13347
0
    }
13348
42.5M
    int result = tailmatch(self, subobj, start, end, -1);
13349
42.5M
    if (result < 0) {
13350
0
        return NULL;
13351
0
    }
13352
42.5M
    return PyBool_FromLong(result);
13353
42.5M
}
13354
13355
13356
/*[clinic input]
13357
@permit_long_summary
13358
@text_signature "($self, suffix[, start[, end]], /)"
13359
str.endswith as unicode_endswith
13360
13361
    suffix as subobj: object
13362
        A string or a tuple of strings to try.
13363
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13364
        Optional start position. Default: start of the string.
13365
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13366
        Optional stop position. Default: end of the string.
13367
    /
13368
13369
Return True if the string ends with the specified suffix, False otherwise.
13370
[clinic start generated code]*/
13371
13372
static PyObject *
13373
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13374
                      Py_ssize_t end)
13375
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13376
10.4M
{
13377
10.4M
    if (PyTuple_Check(subobj)) {
13378
169k
        Py_ssize_t i;
13379
313k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13380
293k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13381
293k
            if (!PyUnicode_Check(substring)) {
13382
0
                PyErr_Format(PyExc_TypeError,
13383
0
                             "tuple for endswith must only contain str, "
13384
0
                             "not %.100s",
13385
0
                             Py_TYPE(substring)->tp_name);
13386
0
                return NULL;
13387
0
            }
13388
293k
            int result = tailmatch(self, substring, start, end, +1);
13389
293k
            if (result < 0) {
13390
0
                return NULL;
13391
0
            }
13392
293k
            if (result) {
13393
150k
                Py_RETURN_TRUE;
13394
150k
            }
13395
293k
        }
13396
169k
        Py_RETURN_FALSE;
13397
169k
    }
13398
10.2M
    if (!PyUnicode_Check(subobj)) {
13399
0
        PyErr_Format(PyExc_TypeError,
13400
0
                     "endswith first arg must be str or "
13401
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13402
0
        return NULL;
13403
0
    }
13404
10.2M
    int result = tailmatch(self, subobj, start, end, +1);
13405
10.2M
    if (result < 0) {
13406
0
        return NULL;
13407
0
    }
13408
10.2M
    return PyBool_FromLong(result);
13409
10.2M
}
13410
13411
13412
/* Textual include placed mid-file.  NOTE(review): presumably this is what
   defines do_string_format() and do_string_format_map(), which the method
   table refers to -- confirm in stringlib/unicode_format.h. */
#include "stringlib/unicode_format.h"

/* Docstring for the hand-written "format" method-table entry.  The first
   line followed by "--" is the signature part of the docstring. */
PyDoc_STRVAR(format__doc__,
             "format($self, /, *args, **kwargs)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring for the hand-written "format_map" method-table entry. */
PyDoc_STRVAR(format_map__doc__,
             "format_map($self, mapping, /)\n\
--\n\
\n\
Return a formatted version of the string, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
13427
13428
/*[clinic input]
13429
str.__format__ as unicode___format__
13430
13431
    format_spec: unicode
13432
    /
13433
13434
Return a formatted version of the string as described by format_spec.
13435
[clinic start generated code]*/
13436
13437
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* str.__format__(format_spec): format `self` into a fresh writer using
       the whole of `format_spec` (range [0, len(format_spec))).  On
       failure the writer is deallocated and NULL is returned; otherwise
       the finished string is returned. */
    const Py_ssize_t spec_end = PyUnicode_GET_LENGTH(format_spec);
    _PyUnicodeWriter writer;

    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                        0, spec_end) == -1)
    {
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13454
13455
/*[clinic input]
13456
str.__sizeof__ as unicode_sizeof
13457
13458
Return the size of the string in memory, in bytes.
13459
[clinic start generated code]*/
13460
13461
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    /* str.__sizeof__(): add up the object header, the character data and,
       when present, the separately stored UTF-8 data.  Every buffer is
       counted with one extra element beyond the string length (presumably
       the terminator). */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    Py_ssize_t nbytes;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* compact ASCII: header followed by one byte per character */
        nbytes = sizeof(PyASCIIObject) + length + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* compact non-ASCII: header followed by `kind` bytes per character */
        nbytes = sizeof(PyCompactUnicodeObject) +
            (length + 1) * PyUnicode_KIND(self);
    }
    else {
        /* two-block object: the base object, plus the character block
           if one is attached */
        nbytes = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            nbytes += (length + 1) *
                PyUnicode_KIND(self);
        }
    }

    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        nbytes += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(nbytes);
}
13489
13490
/* __getnewargs__(): return a 1-tuple holding a copy of `v` made with
   _PyUnicode_Copy().  The "N" format unit hands the reference to the copy
   over to the tuple.  Returns NULL if the copy fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *copy = _PyUnicode_Copy(v);
    return (copy != NULL) ? Py_BuildValue("(N)", copy) : NULL;
}
13498
13499
/*
13500
This function searches for the longest common leading whitespace
13501
of all lines in the range [src, end).
13502
It returns the length of the common leading whitespace and sets `output` to
13503
point to the beginning of the common leading whitespace if length > 0.
13504
*/
13505
static Py_ssize_t
13506
search_longest_common_leading_whitespace(
13507
    const char *const src,
13508
    const char *const end,
13509
    const char **output)
13510
0
{
13511
    // [_start, _start + _len)
13512
    // describes the current longest common leading whitespace
13513
0
    const char *_start = NULL;
13514
0
    Py_ssize_t _len = 0;
13515
13516
0
    for (const char *iter = src; iter < end; ++iter) {
13517
0
        const char *line_start = iter;
13518
0
        const char *leading_whitespace_end = NULL;
13519
13520
        // scan the whole line
13521
0
        while (iter < end && *iter != '\n') {
13522
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13523
                /* `iter` points to the first non-whitespace character
13524
                   in this line */
13525
0
                if (iter == line_start) {
13526
                    // some line has no indent, fast exit!
13527
0
                    return 0;
13528
0
                }
13529
0
                leading_whitespace_end = iter;
13530
0
            }
13531
0
            ++iter;
13532
0
        }
13533
13534
        // if this line has all white space, skip it
13535
0
        if (!leading_whitespace_end) {
13536
0
            continue;
13537
0
        }
13538
13539
0
        if (!_start) {
13540
            // update the first leading whitespace
13541
0
            _start = line_start;
13542
0
            _len = leading_whitespace_end - line_start;
13543
0
            assert(_len > 0);
13544
0
        }
13545
0
        else {
13546
            /* We then compare with the current longest leading whitespace.
13547
13548
               [line_start, leading_whitespace_end) is the leading
13549
               whitespace of this line,
13550
13551
               [_start, _start + _len) is the leading whitespace of the
13552
               current longest leading whitespace. */
13553
0
            Py_ssize_t new_len = 0;
13554
0
            const char *_iter = _start, *line_iter = line_start;
13555
13556
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13557
0
                   && *_iter == *line_iter)
13558
0
            {
13559
0
                ++_iter;
13560
0
                ++line_iter;
13561
0
                ++new_len;
13562
0
            }
13563
13564
0
            _len = new_len;
13565
0
            if (_len == 0) {
13566
                // No common things now, fast exit!
13567
0
                return 0;
13568
0
            }
13569
0
        }
13570
0
    }
13571
13572
0
    assert(_len >= 0);
13573
0
    if (_len > 0) {
13574
0
        *output = _start;
13575
0
    }
13576
0
    return _len;
13577
0
}
13578
13579
/* Dedent a string.
13580
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13581
   only supports spaces and tabs and doesn't normalize empty lines.
13582
   Return a new reference on success, NULL with exception set on error.
13583
   */
13584
PyObject *
_PyUnicode_Dedent(PyObject *unicode)
{
    Py_ssize_t src_len = 0;
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
    if (!src) {
        return NULL;
    }
    assert(src_len >= 0);
    if (src_len == 0) {
        // nothing to dedent: hand back the argument itself
        return Py_NewRef(unicode);
    }

    const char *const end = src + src_len;

    // [whitespace_start, whitespace_start + whitespace_len)
    // describes the current longest common leading whitespace
    const char *whitespace_start = NULL;
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
        src, end, &whitespace_start);

    if (whitespace_len == 0) {
        // no common indentation: hand back the argument itself
        return Py_NewRef(unicode);
    }

    // now we should trigger a dedent
    // Every line written below is no longer than the line it came from,
    // so src_len bytes are always enough for the output.
    char *dest = PyMem_Malloc(src_len);
    if (!dest) {
        PyErr_NoMemory();
        return NULL;
    }
    char *dest_iter = dest;

    for (const char *iter = src; iter < end; ++iter) {
        const char *line_start = iter;
        bool in_leading_space = true;

        // iterate over a line to find the end of a line
        while (iter < end && *iter != '\n') {
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
                in_leading_space = false;
            }
            ++iter;
        }

        // invariant: *iter == '\n' or iter == end
        bool append_newline = iter < end;

        // if this line has all white space, write '\n' and continue
        if (in_leading_space && append_newline) {
            *dest_iter++ = '\n';
            continue;
        }

        /* copy [line_start + whitespace_len, iter) to buffer, then
            conditionally append '\n'.

           A line with content always starts with the common whitespace,
           so it is strictly longer than whitespace_len.  The one line that
           can be shorter is a final whitespace-only line without a
           trailing '\n': the search ignores such lines when computing the
           common prefix.  In that case there is nothing left to copy, and
           the copy must be skipped rather than attempted with a negative
           length. */
        Py_ssize_t line_len = iter - line_start;
        if (line_len > whitespace_len) {
            Py_ssize_t new_line_len = line_len - whitespace_len;
            memcpy(dest_iter, line_start + whitespace_len, new_line_len);
            dest_iter += new_line_len;
        }

        if (append_newline) {
            *dest_iter++ = '\n';
        }
    }

    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
    PyMem_Free(dest);
    return res;
}
13656
13657
/* Method table for the str methods implemented in this file.

   Most entries are *_METHODDEF macros (no trailing comma is written after
   them here, so each macro is expected to expand to a complete, comma
   terminated entry -- NOTE(review): presumably generated by Argument
   Clinic; confirm in the clinic header).  "format", "format_map" and
   "__getnewargs__" are written out by hand.  The {NULL, NULL} entry is the
   end-of-table sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    UNICODE_COUNT_METHODDEF
    UNICODE_EXPANDTABS_METHODDEF
    UNICODE_FIND_METHODDEF
    UNICODE_PARTITION_METHODDEF
    UNICODE_INDEX_METHODDEF
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    UNICODE_RFIND_METHODDEF
    UNICODE_RINDEX_METHODDEF
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    UNICODE_STARTSWITH_METHODDEF
    UNICODE_ENDSWITH_METHODDEF
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* hand-written entries: implemented outside Argument Clinic */
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
    /* no docstring is supplied for __getnewargs__ */
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}  /* sentinel */
};
13710
13711
/* nb_remainder slot: implements `v % w` when `v` is a str by calling
   PyUnicode_Format(v, w).  When `v` is not a str, NotImplemented is
   returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13718
13719
static PyNumberMethods unicode_as_number = {
13720
    0,              /*nb_add*/
13721
    0,              /*nb_subtract*/
13722
    0,              /*nb_multiply*/
13723
    unicode_mod,            /*nb_remainder*/
13724
};
13725
13726
static PySequenceMethods unicode_as_sequence = {
13727
    unicode_length,     /* sq_length */
13728
    PyUnicode_Concat,   /* sq_concat */
13729
    unicode_repeat,     /* sq_repeat */
13730
    unicode_getitem,    /* sq_item */
13731
    0,                  /* sq_slice */
13732
    0,                  /* sq_ass_item */
13733
    0,                  /* sq_ass_slice */
13734
    PyUnicode_Contains, /* sq_contains */
13735
};
13736
13737
/* mp_subscript slot: implements self[item].

   - An index-like `item` is converted to Py_ssize_t (IndexError on
     overflow); a negative value is offset by the string length once and
     then handed to unicode_getitem().
   - A slice yields the empty string, `self` via unicode_result_unchanged(),
     a PyUnicode_Substring() or, for steps other than 1, a newly built
     string.
   - Anything else raises TypeError.

   Returns NULL with an exception set on error. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (_PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
                     Py_TYPE(item)->tp_name);
        return NULL;
    }

    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t slicelength = PySlice_AdjustIndices(length,
                                                         &start, &stop, step);

    if (slicelength <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && slicelength == length) {
            /* the slice covers the whole string */
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + slicelength);
    }

    /* General case: step != 1, pick the characters one by one */
    const int src_kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: find the largest selected code point so the result can
       be allocated for it.  ASCII sources need no scan. */
    Py_UCS4 max_char;
    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        const Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        /* The position is kept in a size_t; adding a negative step relies
           on unsigned wrap-around, exactly as the index arithmetic needs. */
        size_t pos = start;
        max_char = 0;
        for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
            if (ch > max_char) {
                max_char = ch;
                if (max_char >= kind_limit) {
                    /* reached kind_limit: stop scanning */
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(slicelength, max_char);
    if (result == NULL) {
        return NULL;
    }
    const int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    /* Second pass: copy the selected characters */
    size_t pos = start;
    for (Py_ssize_t n = 0; n < slicelength; n++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
13806
13807
static PyMappingMethods unicode_as_mapping = {
13808
    unicode_length,     /* mp_length */
13809
    unicode_subscript,  /* mp_subscript */
13810
    0,                  /* mp_ass_subscript */
13811
};
13812
13813
13814
static PyObject *
13815
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13816
13817
/*[clinic input]
13818
@classmethod
13819
str.__new__ as unicode_new
13820
13821
    object as x: object = NULL
13822
    encoding: str = NULL
13823
    errors: str = NULL
13824
13825
[clinic start generated code]*/
13826
13827
static PyObject *
13828
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13829
                 const char *errors)
13830
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13831
16.0M
{
13832
16.0M
    PyObject *unicode;
13833
16.0M
    if (x == NULL) {
13834
0
        unicode = _PyUnicode_GetEmpty();
13835
0
    }
13836
16.0M
    else if (encoding == NULL && errors == NULL) {
13837
16.0M
        unicode = PyObject_Str(x);
13838
16.0M
    }
13839
0
    else {
13840
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13841
0
    }
13842
13843
16.0M
    if (unicode != NULL && type != &PyUnicode_Type) {
13844
16.0M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13845
16.0M
    }
13846
16.0M
    return unicode;
13847
16.0M
}
13848
13849
static const char *
13850
arg_as_utf8(PyObject *obj, const char *name)
13851
2.92M
{
13852
2.92M
    if (!PyUnicode_Check(obj)) {
13853
0
        PyErr_Format(PyExc_TypeError,
13854
0
                     "str() argument '%s' must be str, not %T",
13855
0
                     name, obj);
13856
0
        return NULL;
13857
0
    }
13858
2.92M
    return _PyUnicode_AsUTF8NoNUL(obj);
13859
2.92M
}
13860
13861
static PyObject *
13862
unicode_vectorcall(PyObject *type, PyObject *const *args,
13863
                   size_t nargsf, PyObject *kwnames)
13864
2.56M
{
13865
2.56M
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
13866
13867
2.56M
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
13868
2.56M
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
13869
        // Fallback to unicode_new()
13870
0
        PyObject *tuple = PyTuple_FromArray(args, nargs);
13871
0
        if (tuple == NULL) {
13872
0
            return NULL;
13873
0
        }
13874
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
13875
0
        if (dict == NULL) {
13876
0
            Py_DECREF(tuple);
13877
0
            return NULL;
13878
0
        }
13879
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
13880
0
        Py_DECREF(tuple);
13881
0
        Py_DECREF(dict);
13882
0
        return ret;
13883
0
    }
13884
2.56M
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
13885
0
        return NULL;
13886
0
    }
13887
2.56M
    if (nargs == 0) {
13888
4.50k
        return _PyUnicode_GetEmpty();
13889
4.50k
    }
13890
2.55M
    PyObject *object = args[0];
13891
2.55M
    if (nargs == 1) {
13892
1.30k
        return PyObject_Str(object);
13893
1.30k
    }
13894
2.55M
    const char *encoding = arg_as_utf8(args[1], "encoding");
13895
2.55M
    if (encoding == NULL) {
13896
0
        return NULL;
13897
0
    }
13898
2.55M
    const char *errors = NULL;
13899
2.55M
    if (nargs == 3) {
13900
366k
        errors = arg_as_utf8(args[2], "errors");
13901
366k
        if (errors == NULL) {
13902
0
            return NULL;
13903
0
        }
13904
366k
    }
13905
2.55M
    return PyUnicode_FromEncodedObject(object, encoding, errors);
13906
2.55M
}
13907
13908
static PyObject *
13909
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13910
16.0M
{
13911
16.0M
    PyObject *self;
13912
16.0M
    Py_ssize_t length, char_size;
13913
16.0M
    int share_utf8;
13914
16.0M
    int kind;
13915
16.0M
    void *data;
13916
13917
16.0M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13918
16.0M
    assert(_PyUnicode_CHECK(unicode));
13919
13920
16.0M
    self = type->tp_alloc(type, 0);
13921
16.0M
    if (self == NULL) {
13922
0
        return NULL;
13923
0
    }
13924
16.0M
    kind = PyUnicode_KIND(unicode);
13925
16.0M
    length = PyUnicode_GET_LENGTH(unicode);
13926
13927
16.0M
    _PyUnicode_LENGTH(self) = length;
13928
#ifdef Py_DEBUG
13929
    _PyUnicode_HASH(self) = -1;
13930
#else
13931
16.0M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13932
16.0M
#endif
13933
16.0M
    _PyUnicode_STATE(self).interned = 0;
13934
16.0M
    _PyUnicode_STATE(self).kind = kind;
13935
16.0M
    _PyUnicode_STATE(self).compact = 0;
13936
16.0M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13937
16.0M
    _PyUnicode_STATE(self).statically_allocated = 0;
13938
16.0M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13939
16.0M
    PyUnicode_SET_UTF8(self, NULL);
13940
16.0M
    _PyUnicode_DATA_ANY(self) = NULL;
13941
13942
16.0M
    share_utf8 = 0;
13943
16.0M
    if (kind == PyUnicode_1BYTE_KIND) {
13944
13.6M
        char_size = 1;
13945
13.6M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13946
13.5M
            share_utf8 = 1;
13947
13.6M
    }
13948
2.44M
    else if (kind == PyUnicode_2BYTE_KIND) {
13949
2.39M
        char_size = 2;
13950
2.39M
    }
13951
57.3k
    else {
13952
57.3k
        assert(kind == PyUnicode_4BYTE_KIND);
13953
57.3k
        char_size = 4;
13954
57.3k
    }
13955
13956
    /* Ensure we won't overflow the length. */
13957
16.0M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13958
0
        PyErr_NoMemory();
13959
0
        goto onError;
13960
0
    }
13961
16.0M
    data = PyMem_Malloc((length + 1) * char_size);
13962
16.0M
    if (data == NULL) {
13963
0
        PyErr_NoMemory();
13964
0
        goto onError;
13965
0
    }
13966
13967
16.0M
    _PyUnicode_DATA_ANY(self) = data;
13968
16.0M
    if (share_utf8) {
13969
13.5M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13970
13.5M
        PyUnicode_SET_UTF8(self, data);
13971
13.5M
    }
13972
13973
16.0M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13974
16.0M
    assert(_PyUnicode_CheckConsistency(self, 1));
13975
#ifdef Py_DEBUG
13976
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13977
#endif
13978
16.0M
    return self;
13979
13980
0
onError:
13981
0
    Py_DECREF(self);
13982
0
    return NULL;
13983
16.0M
}
13984
13985
static _PyObjectIndexPair
13986
unicode_iteritem(PyObject *obj, Py_ssize_t index)
13987
44.9M
{
13988
44.9M
    if (index >= PyUnicode_GET_LENGTH(obj)) {
13989
3.93M
        return (_PyObjectIndexPair) { .object = NULL, .index = index };
13990
3.93M
    }
13991
41.0M
    const void *data = PyUnicode_DATA(obj);
13992
41.0M
    int kind = PyUnicode_KIND(obj);
13993
41.0M
    Py_UCS4 ch = PyUnicode_READ(kind, data, index);
13994
41.0M
    PyObject *result = unicode_char(ch);
13995
41.0M
    index = (result == NULL) ? -1 : index + 1;
13996
41.0M
    return (_PyObjectIndexPair) { .object = result, .index = index };
13997
44.9M
}
13998
13999
void
14000
_PyUnicode_ExactDealloc(PyObject *op)
14001
76.2M
{
14002
76.2M
    assert(PyUnicode_CheckExact(op));
14003
76.2M
    unicode_dealloc(op);
14004
76.2M
}
14005
14006
PyDoc_STRVAR(unicode_doc,
14007
"str(object='') -> str\n\
14008
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14009
\n\
14010
Create a new string object from the given object. If encoding or\n\
14011
errors is specified, then the object must expose a data buffer\n\
14012
that will be decoded using the given encoding and error handler.\n\
14013
Otherwise, returns the result of object.__str__() (if defined)\n\
14014
or repr(object).\n\
14015
encoding defaults to 'utf-8'.\n\
14016
errors defaults to 'strict'.");
14017
14018
static PyObject *unicode_iter(PyObject *seq);
14019
14020
PyTypeObject PyUnicode_Type = {
14021
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14022
    "str",                        /* tp_name */
14023
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14024
    0,                            /* tp_itemsize */
14025
    /* Slots */
14026
    unicode_dealloc,              /* tp_dealloc */
14027
    0,                            /* tp_vectorcall_offset */
14028
    0,                            /* tp_getattr */
14029
    0,                            /* tp_setattr */
14030
    0,                            /* tp_as_async */
14031
    unicode_repr,                 /* tp_repr */
14032
    &unicode_as_number,           /* tp_as_number */
14033
    &unicode_as_sequence,         /* tp_as_sequence */
14034
    &unicode_as_mapping,          /* tp_as_mapping */
14035
    unicode_hash,                 /* tp_hash*/
14036
    0,                            /* tp_call*/
14037
    unicode_str,                  /* tp_str */
14038
    PyObject_GenericGetAttr,      /* tp_getattro */
14039
    0,                            /* tp_setattro */
14040
    0,                            /* tp_as_buffer */
14041
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14042
        Py_TPFLAGS_UNICODE_SUBCLASS |
14043
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14044
    unicode_doc,                  /* tp_doc */
14045
    0,                            /* tp_traverse */
14046
    0,                            /* tp_clear */
14047
    PyUnicode_RichCompare,        /* tp_richcompare */
14048
    0,                            /* tp_weaklistoffset */
14049
    unicode_iter,                 /* tp_iter */
14050
    0,                            /* tp_iternext */
14051
    unicode_methods,              /* tp_methods */
14052
    0,                            /* tp_members */
14053
    0,                            /* tp_getset */
14054
    0,                            /* tp_base */
14055
    0,                            /* tp_dict */
14056
    0,                            /* tp_descr_get */
14057
    0,                            /* tp_descr_set */
14058
    0,                            /* tp_dictoffset */
14059
    0,                            /* tp_init */
14060
    0,                            /* tp_alloc */
14061
    unicode_new,                  /* tp_new */
14062
    PyObject_Free,                /* tp_free */
14063
    .tp_vectorcall = unicode_vectorcall,
14064
    ._tp_iteritem = unicode_iteritem,
14065
};
14066
14067
/* Initialize the Unicode implementation */
14068
14069
static void
14070
_init_global_state(void)
14071
36
{
14072
36
    static int initialized = 0;
14073
36
    if (initialized) {
14074
0
        return;
14075
0
    }
14076
36
    initialized = 1;
14077
14078
    /* initialize the linebreak bloom filter */
14079
36
    const Py_UCS2 linebreak[] = {
14080
36
        0x000A, /* LINE FEED */
14081
36
        0x000D, /* CARRIAGE RETURN */
14082
36
        0x001C, /* FILE SEPARATOR */
14083
36
        0x001D, /* GROUP SEPARATOR */
14084
36
        0x001E, /* RECORD SEPARATOR */
14085
36
        0x0085, /* NEXT LINE */
14086
36
        0x2028, /* LINE SEPARATOR */
14087
36
        0x2029, /* PARAGRAPH SEPARATOR */
14088
36
    };
14089
36
    bloom_linebreak = make_bloom_mask(
14090
36
        PyUnicode_2BYTE_KIND, linebreak,
14091
36
        Py_ARRAY_LENGTH(linebreak));
14092
36
}
14093
14094
void
14095
_PyUnicode_InitState(PyInterpreterState *interp)
14096
36
{
14097
36
    if (!_Py_IsMainInterpreter(interp)) {
14098
0
        return;
14099
0
    }
14100
36
    _init_global_state();
14101
36
}
14102
14103
14104
PyStatus
14105
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14106
36
{
14107
36
    if (_Py_IsMainInterpreter(interp)) {
14108
36
        PyStatus status = init_global_interned_strings(interp);
14109
36
        if (_PyStatus_EXCEPTION(status)) {
14110
0
            return status;
14111
0
        }
14112
36
    }
14113
36
    assert(INTERNED_STRINGS);
14114
14115
36
    if (init_interned_dict(interp)) {
14116
0
        PyErr_Clear();
14117
0
        return _PyStatus_ERR("failed to create interned dict");
14118
0
    }
14119
14120
36
    return _PyStatus_OK();
14121
36
}
14122
14123
14124
PyStatus
14125
_PyUnicode_InitTypes(PyInterpreterState *interp)
14126
36
{
14127
36
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14128
0
        goto error;
14129
0
    }
14130
36
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14131
0
        goto error;
14132
0
    }
14133
36
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14134
0
        goto error;
14135
0
    }
14136
36
    return _PyStatus_OK();
14137
14138
0
error:
14139
0
    return _PyStatus_ERR("Can't initialize unicode types");
14140
36
}
14141
14142
static /* non-null */ PyObject*
14143
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14144
40.1k
{
14145
    // Note that this steals a reference to `s`, but in many cases that
14146
    // stolen ref is returned, requiring no decref/incref.
14147
14148
40.1k
    assert(s != NULL);
14149
40.1k
    assert(_PyUnicode_CHECK(s));
14150
40.1k
    assert(_PyUnicode_STATE(s).statically_allocated);
14151
40.1k
    assert(!PyUnicode_CHECK_INTERNED(s));
14152
14153
#ifdef Py_DEBUG
14154
    /* We must not add process-global interned string if there's already a
14155
     * per-interpreter interned_dict, which might contain duplicates.
14156
     */
14157
    PyObject *interned = get_interned_dict(interp);
14158
    assert(interned == NULL);
14159
#endif
14160
14161
    /* Look in the global cache first. */
14162
40.1k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14163
    /* We should only init each string once */
14164
40.1k
    assert(r == NULL);
14165
    /* but just in case (for the non-debug build), handle this */
14166
40.1k
    if (r != NULL && r != s) {
14167
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14168
0
        assert(_PyUnicode_CHECK(r));
14169
0
        Py_DECREF(s);
14170
0
        return Py_NewRef(r);
14171
0
    }
14172
14173
40.1k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14174
0
        Py_FatalError("failed to intern static string");
14175
0
    }
14176
14177
40.1k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14178
40.1k
    return s;
14179
40.1k
}
14180
14181
void
14182
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14183
40.1k
{
14184
    // This should only be called as part of runtime initialization
14185
40.1k
    assert(!Py_IsInitialized());
14186
14187
40.1k
    *p = intern_static(interp, *p);
14188
40.1k
    assert(*p);
14189
40.1k
}
14190
14191
static void
14192
immortalize_interned(PyObject *s)
14193
291k
{
14194
291k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14195
291k
    assert(!_Py_IsImmortal(s));
14196
#ifdef Py_REF_DEBUG
14197
    /* The reference count value should be excluded from the RefTotal.
14198
       The decrements to these objects will not be registered so they
14199
       need to be accounted for in here. */
14200
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14201
        _Py_DecRefTotal(_PyThreadState_GET());
14202
    }
14203
#endif
14204
291k
    _Py_SetImmortal(s);
14205
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14206
    // to synchronize with the check in intern_common() that avoids locking if
14207
    // the string is already immortal.
14208
291k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14209
291k
}
14210
14211
static /* non-null */ PyObject*
14212
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14213
              bool immortalize)
14214
94.7M
{
14215
    // Note that this steals a reference to `s`, but in many cases that
14216
    // stolen ref is returned, requiring no decref/incref.
14217
14218
#ifdef Py_DEBUG
14219
    assert(s != NULL);
14220
    assert(_PyUnicode_CHECK(s));
14221
#else
14222
94.7M
    if (s == NULL || !PyUnicode_Check(s)) {
14223
0
        return s;
14224
0
    }
14225
94.7M
#endif
14226
14227
    /* If it's a subclass, we don't really know what putting
14228
       it in the interned dict might do. */
14229
94.7M
    if (!PyUnicode_CheckExact(s)) {
14230
0
        return s;
14231
0
    }
14232
14233
    /* Is it already interned? */
14234
94.7M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14235
6.32M
        case SSTATE_NOT_INTERNED:
14236
            // no, go on
14237
6.32M
            break;
14238
28.5k
        case SSTATE_INTERNED_MORTAL:
14239
            // yes but we might need to make it immortal
14240
28.5k
            if (immortalize) {
14241
1.80k
                immortalize_interned(s);
14242
1.80k
            }
14243
28.5k
            return s;
14244
88.4M
        default:
14245
            // all done
14246
88.4M
            return s;
14247
94.7M
    }
14248
14249
    /* Statically allocated strings must be already interned. */
14250
94.7M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14251
14252
#if Py_GIL_DISABLED
14253
    /* In the free-threaded build, all interned strings are immortal */
14254
    immortalize = 1;
14255
#endif
14256
14257
    /* If it's already immortal, intern it as such */
14258
6.32M
    if (_Py_IsImmortal(s)) {
14259
0
        immortalize = 1;
14260
0
    }
14261
14262
    /* if it's a short string, get the singleton */
14263
6.32M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14264
20.0k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14265
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14266
0
        assert(PyUnicode_CHECK_INTERNED(r));
14267
0
        Py_DECREF(s);
14268
0
        return r;
14269
0
    }
14270
#ifdef Py_DEBUG
14271
    assert(!unicode_is_singleton(s));
14272
#endif
14273
14274
    /* Look in the global cache now. */
14275
6.32M
    {
14276
6.32M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14277
6.32M
        if (r != NULL) {
14278
538k
            assert(_PyUnicode_STATE(r).statically_allocated);
14279
538k
            assert(r != s);  // r must be statically_allocated; s is not
14280
538k
            Py_DECREF(s);
14281
538k
            return Py_NewRef(r);
14282
538k
        }
14283
6.32M
    }
14284
14285
    /* Do a setdefault on the per-interpreter cache. */
14286
5.78M
    PyObject *interned = get_interned_dict(interp);
14287
5.78M
    assert(interned != NULL);
14288
#ifdef Py_GIL_DISABLED
14289
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14290
    // Lock-free fast path: check if there's already an interned copy that
14291
    // is in its final immortal state.
14292
    PyObject *r;
14293
    int res = PyDict_GetItemRef(interned, s, &r);
14294
    if (res < 0) {
14295
        PyErr_Clear();
14296
        return s;
14297
    }
14298
    if (res > 0) {
14299
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14300
        if (state == SSTATE_INTERNED_IMMORTAL) {
14301
            Py_DECREF(s);
14302
            return r;
14303
        }
14304
        // Not yet fully interned; fall through to the locking path.
14305
        Py_DECREF(r);
14306
    }
14307
#endif
14308
5.78M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14309
5.78M
    PyObject *t;
14310
5.78M
    {
14311
5.78M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14312
5.78M
        if (res < 0) {
14313
0
            PyErr_Clear();
14314
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14315
0
            return s;
14316
0
        }
14317
5.78M
        else if (res == 1) {
14318
            // value was already present (not inserted)
14319
5.01M
            Py_DECREF(s);
14320
5.01M
            if (immortalize &&
14321
1.10M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14322
10.0k
                immortalize_interned(t);
14323
10.0k
            }
14324
5.01M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14325
5.01M
            return t;
14326
5.01M
        }
14327
770k
        else {
14328
            // value was newly inserted
14329
770k
            assert (s == t);
14330
770k
            Py_DECREF(t);
14331
770k
        }
14332
5.78M
    }
14333
14334
    /* NOT_INTERNED -> INTERNED_MORTAL */
14335
14336
5.78M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14337
14338
770k
    if (!_Py_IsImmortal(s)) {
14339
        /* The two references in interned dict (key and value) are not counted.
14340
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14341
770k
        Py_DECREF(s);
14342
770k
        Py_DECREF(s);
14343
770k
    }
14344
770k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14345
14346
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14347
14348
#ifdef Py_DEBUG
14349
    if (_Py_IsImmortal(s)) {
14350
        assert(immortalize);
14351
    }
14352
#endif
14353
770k
    if (immortalize) {
14354
279k
        immortalize_interned(s);
14355
279k
    }
14356
14357
770k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14358
770k
    return s;
14359
5.78M
}
14360
14361
void
14362
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14363
14.8M
{
14364
14.8M
    *p = intern_common(interp, *p, 1);
14365
14.8M
    assert(*p);
14366
14.8M
}
14367
14368
void
14369
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14370
79.9M
{
14371
79.9M
    *p = intern_common(interp, *p, 0);
14372
79.9M
    assert(*p);
14373
79.9M
}
14374
14375
14376
void
14377
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14378
0
{
14379
0
    _PyUnicode_InternImmortal(interp, p);
14380
0
    return;
14381
0
}
14382
14383
void
14384
PyUnicode_InternInPlace(PyObject **p)
14385
0
{
14386
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14387
0
    _PyUnicode_InternMortal(interp, p);
14388
0
}
14389
14390
// Public-looking name kept for the stable ABI; user should not call this:
14391
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14392
void
14393
PyUnicode_InternImmortal(PyObject **p)
14394
0
{
14395
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14396
0
    _PyUnicode_InternImmortal(interp, p);
14397
0
}
14398
14399
PyObject *
14400
PyUnicode_InternFromString(const char *cp)
14401
1.33M
{
14402
1.33M
    PyObject *s = PyUnicode_FromString(cp);
14403
1.33M
    if (s == NULL) {
14404
0
        return NULL;
14405
0
    }
14406
1.33M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14407
1.33M
    _PyUnicode_InternMortal(interp, &s);
14408
1.33M
    return s;
14409
1.33M
}
14410
14411
14412
void
14413
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14414
0
{
14415
0
    PyObject *interned = get_interned_dict(interp);
14416
0
    if (interned == NULL) {
14417
0
        return;
14418
0
    }
14419
0
    assert(PyDict_CheckExact(interned));
14420
14421
0
    if (has_shared_intern_dict(interp)) {
14422
        // the dict doesn't belong to this interpreter, skip the debug
14423
        // checks on it and just clear the pointer to it
14424
0
        clear_interned_dict(interp);
14425
0
        return;
14426
0
    }
14427
14428
#ifdef INTERNED_STATS
14429
    fprintf(stderr, "releasing %zd interned strings\n",
14430
            PyDict_GET_SIZE(interned));
14431
14432
    Py_ssize_t total_length = 0;
14433
#endif
14434
0
    Py_ssize_t pos = 0;
14435
0
    PyObject *s, *ignored_value;
14436
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14437
0
        int shared = 0;
14438
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14439
0
        case SSTATE_INTERNED_IMMORTAL:
14440
            /* Make immortal interned strings mortal again. */
14441
            // Skip the Immortal Instance check and restore
14442
            // the two references (key and value) ignored
14443
            // by PyUnicode_InternInPlace().
14444
0
            _Py_SetMortal(s, 2);
14445
#ifdef Py_REF_DEBUG
14446
            /* let's be pedantic with the ref total */
14447
            _Py_IncRefTotal(_PyThreadState_GET());
14448
            _Py_IncRefTotal(_PyThreadState_GET());
14449
#endif
14450
#ifdef INTERNED_STATS
14451
            total_length += PyUnicode_GET_LENGTH(s);
14452
#endif
14453
0
            break;
14454
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14455
            /* It is shared between interpreters, so we should unmark it
14456
               only when this is the last interpreter in which it's
14457
               interned.  We immortalize all the statically initialized
14458
               strings during startup, so we can rely on the
14459
               main interpreter to be the last one. */
14460
0
            if (!_Py_IsMainInterpreter(interp)) {
14461
0
                shared = 1;
14462
0
            }
14463
0
            break;
14464
0
        case SSTATE_INTERNED_MORTAL:
14465
            // Restore 2 references held by the interned dict; these will
14466
            // be decref'd by clear_interned_dict's PyDict_Clear.
14467
0
            _Py_RefcntAdd(s, 2);
14468
#ifdef Py_REF_DEBUG
14469
            /* let's be pedantic with the ref total */
14470
            _Py_IncRefTotal(_PyThreadState_GET());
14471
            _Py_IncRefTotal(_PyThreadState_GET());
14472
#endif
14473
0
            break;
14474
0
        case SSTATE_NOT_INTERNED:
14475
0
            _Py_FALLTHROUGH;
14476
0
        default:
14477
0
            Py_UNREACHABLE();
14478
0
        }
14479
0
        if (!shared) {
14480
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14481
0
        }
14482
0
    }
14483
#ifdef INTERNED_STATS
14484
    fprintf(stderr,
14485
            "total length of all interned strings: %zd characters\n",
14486
            total_length);
14487
#endif
14488
14489
0
    struct _Py_unicode_state *state = &interp->unicode;
14490
0
    struct _Py_unicode_ids *ids = &state->ids;
14491
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14492
0
        Py_XINCREF(ids->array[i]);
14493
0
    }
14494
0
    clear_interned_dict(interp);
14495
0
    if (_Py_IsMainInterpreter(interp)) {
14496
0
        clear_global_interned_strings();
14497
0
    }
14498
0
}
14499
14500
14501
/********************* Unicode Iterator **************************/
14502
14503
typedef struct {
14504
    PyObject_HEAD
14505
    Py_ssize_t it_index;
14506
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14507
} unicodeiterobject;
14508
14509
static void
14510
unicodeiter_dealloc(PyObject *op)
14511
1.73M
{
14512
1.73M
    unicodeiterobject *it = (unicodeiterobject *)op;
14513
1.73M
    _PyObject_GC_UNTRACK(it);
14514
1.73M
    Py_XDECREF(it->it_seq);
14515
1.73M
    PyObject_GC_Del(it);
14516
1.73M
}
14517
14518
static int
14519
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14520
2
{
14521
2
    unicodeiterobject *it = (unicodeiterobject *)op;
14522
2
    Py_VISIT(it->it_seq);
14523
2
    return 0;
14524
2
}
14525
14526
/* tp_iternext for str_iterator: return the next character as a
   one-character string, or NULL (without setting an exception here) once
   the string is exhausted.  On exhaustion the reference to the string is
   released and it_seq is cleared. */
static PyObject *
unicodeiter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: forget the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    Py_UCS4 chr = PyUnicode_READ(PyUnicode_KIND(seq), PyUnicode_DATA(seq),
                                 it->it_index);
    it->it_index++;
    return unicode_char(chr);
}
14550
14551
/* tp_iternext for str_ascii_iterator.  The iterated string is asserted to
   be compact ASCII, so each character is read as a single byte stored
   right after the PyASCIIObject header and returned as the matching
   entry of the ASCII singleton table. */
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *seq = it->it_seq;
    if (seq == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(seq));
    assert(PyUnicode_IS_COMPACT_ASCII(seq));

    if (it->it_index >= PyUnicode_GET_LENGTH(seq)) {
        /* Exhausted: forget the string. */
        it->it_seq = NULL;
        Py_DECREF(seq);
        return NULL;
    }

    const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
    Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                          data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
}
14573
14574
/* __length_hint__: number of characters not yet yielded, or 0 once the
   iterator no longer references a string. */
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;
    Py_ssize_t remaining = 0;

    if (it->it_seq != NULL) {
        remaining = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
    }
    return PyLong_FromSsize_t(remaining);
}
14583
14584
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14585
14586
/* __reduce__: return (iter, (seq,), index) for a live iterator, or
   (iter, ('',)) for an exhausted one. */
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* Fetch the builtin first: _PyEval_GetBuiltin can invoke arbitrary
     * code, so the iterator's fields must only be read afterwards.
     * see issue #101765 */
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq != NULL) {
        return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
    }

    PyObject *empty = _PyUnicode_GetEmpty();
    if (empty == NULL) {
        Py_XDECREF(iter);
        return NULL;
    }
    return Py_BuildValue("N(N)", iter, empty);
}
14607
14608
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14609
14610
/* __setstate__: restore the iterator position from the integer `state`.
   The position is clamped to [0, len(seq)]; an exhausted iterator is left
   unchanged.  Returns None, or NULL if `state` cannot be converted. */
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
14626
14627
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14628
14629
static PyMethodDef unicodeiter_methods[] = {
14630
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14631
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14632
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14633
    {NULL,      NULL}       /* sentinel */
14634
};
14635
14636
PyTypeObject PyUnicodeIter_Type = {
14637
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14638
    "str_iterator",         /* tp_name */
14639
    sizeof(unicodeiterobject),      /* tp_basicsize */
14640
    0,                  /* tp_itemsize */
14641
    /* methods */
14642
    unicodeiter_dealloc,/* tp_dealloc */
14643
    0,                  /* tp_vectorcall_offset */
14644
    0,                  /* tp_getattr */
14645
    0,                  /* tp_setattr */
14646
    0,                  /* tp_as_async */
14647
    0,                  /* tp_repr */
14648
    0,                  /* tp_as_number */
14649
    0,                  /* tp_as_sequence */
14650
    0,                  /* tp_as_mapping */
14651
    0,                  /* tp_hash */
14652
    0,                  /* tp_call */
14653
    0,                  /* tp_str */
14654
    PyObject_GenericGetAttr,        /* tp_getattro */
14655
    0,                  /* tp_setattro */
14656
    0,                  /* tp_as_buffer */
14657
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14658
    0,                  /* tp_doc */
14659
    unicodeiter_traverse, /* tp_traverse */
14660
    0,                  /* tp_clear */
14661
    0,                  /* tp_richcompare */
14662
    0,                  /* tp_weaklistoffset */
14663
    PyObject_SelfIter,          /* tp_iter */
14664
    unicodeiter_next,   /* tp_iternext */
14665
    unicodeiter_methods,            /* tp_methods */
14666
    0,
14667
};
14668
14669
PyTypeObject _PyUnicodeASCIIIter_Type = {
14670
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14671
    .tp_name = "str_ascii_iterator",
14672
    .tp_basicsize = sizeof(unicodeiterobject),
14673
    .tp_dealloc = unicodeiter_dealloc,
14674
    .tp_getattro = PyObject_GenericGetAttr,
14675
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14676
    .tp_traverse = unicodeiter_traverse,
14677
    .tp_iter = PyObject_SelfIter,
14678
    .tp_iternext = unicode_ascii_iter_next,
14679
    .tp_methods = unicodeiter_methods,
14680
};
14681
14682
/* Create a new iterator over the string `seq`.

   Compact ASCII strings get a _PyUnicodeASCIIIter_Type iterator, every
   other string a PyUnicodeIter_Type iterator.  The iterator holds a strong
   reference to `seq` and starts at index 0.

   Returns a new reference, or NULL on failure: PyErr_BadInternalCall() is
   raised when `seq` is not a str; NULL is also returned when the
   allocation fails. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Pick the iterator type first so there is a single allocation site. */
    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                                  ? &_PyUnicodeASCIIIter_Type
                                  : &PyUnicodeIter_Type;

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject, iter_type);
    if (iter == NULL) {
        return NULL;
    }

    iter->it_seq = Py_NewRef(seq);
    iter->it_index = 0;

    /* Only start GC tracking once every field is initialized. */
    _PyObject_GC_TRACK(iter);
    return (PyObject *)iter;
}
14704
14705
static int
14706
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14707
144
{
14708
144
    int res;
14709
144
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14710
144
    if (res == -2) {
14711
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14712
0
        return -1;
14713
0
    }
14714
144
    if (res < 0) {
14715
0
        PyErr_NoMemory();
14716
0
        return -1;
14717
0
    }
14718
144
    return 0;
14719
144
}
14720
14721
14722
static int
14723
config_get_codec_name(wchar_t **config_encoding)
14724
72
{
14725
72
    char *encoding;
14726
72
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14727
0
        return -1;
14728
0
    }
14729
14730
72
    PyObject *name_obj = NULL;
14731
72
    PyObject *codec = _PyCodec_Lookup(encoding);
14732
72
    PyMem_RawFree(encoding);
14733
14734
72
    if (!codec)
14735
0
        goto error;
14736
14737
72
    name_obj = PyObject_GetAttrString(codec, "name");
14738
72
    Py_CLEAR(codec);
14739
72
    if (!name_obj) {
14740
0
        goto error;
14741
0
    }
14742
14743
72
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14744
72
    Py_DECREF(name_obj);
14745
72
    if (wname == NULL) {
14746
0
        goto error;
14747
0
    }
14748
14749
72
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14750
72
    if (raw_wname == NULL) {
14751
0
        PyMem_Free(wname);
14752
0
        PyErr_NoMemory();
14753
0
        goto error;
14754
0
    }
14755
14756
72
    PyMem_RawFree(*config_encoding);
14757
72
    *config_encoding = raw_wname;
14758
14759
72
    PyMem_Free(wname);
14760
72
    return 0;
14761
14762
0
error:
14763
0
    Py_XDECREF(codec);
14764
0
    Py_XDECREF(name_obj);
14765
0
    return -1;
14766
72
}
14767
14768
14769
static PyStatus
14770
init_stdio_encoding(PyInterpreterState *interp)
14771
36
{
14772
    /* Update the stdio encoding to the normalized Python codec name. */
14773
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14774
36
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14775
0
        return _PyStatus_ERR("failed to get the Python codec name "
14776
0
                             "of the stdio encoding");
14777
0
    }
14778
36
    return _PyStatus_OK();
14779
36
}
14780
14781
14782
static int
14783
init_fs_codec(PyInterpreterState *interp)
14784
36
{
14785
36
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14786
14787
36
    _Py_error_handler error_handler;
14788
36
    error_handler = get_error_handler_wide(config->filesystem_errors);
14789
36
    if (error_handler == _Py_ERROR_UNKNOWN) {
14790
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14791
0
        return -1;
14792
0
    }
14793
14794
36
    char *encoding, *errors;
14795
36
    if (encode_wstr_utf8(config->filesystem_encoding,
14796
36
                         &encoding,
14797
36
                         "filesystem_encoding") < 0) {
14798
0
        return -1;
14799
0
    }
14800
14801
36
    if (encode_wstr_utf8(config->filesystem_errors,
14802
36
                         &errors,
14803
36
                         "filesystem_errors") < 0) {
14804
0
        PyMem_RawFree(encoding);
14805
0
        return -1;
14806
0
    }
14807
14808
36
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14809
36
    PyMem_RawFree(fs_codec->encoding);
14810
36
    fs_codec->encoding = encoding;
14811
    /* encoding has been normalized by init_fs_encoding() */
14812
36
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14813
36
    PyMem_RawFree(fs_codec->errors);
14814
36
    fs_codec->errors = errors;
14815
36
    fs_codec->error_handler = error_handler;
14816
14817
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14818
    assert(fs_codec->utf8 == 1);
14819
#endif
14820
14821
    /* At this point, PyUnicode_EncodeFSDefault() and
14822
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14823
       the C implementation of the filesystem encoding. */
14824
14825
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14826
       global configuration variables. */
14827
36
    if (_Py_IsMainInterpreter(interp)) {
14828
14829
36
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14830
36
                                      fs_codec->errors) < 0) {
14831
0
            PyErr_NoMemory();
14832
0
            return -1;
14833
0
        }
14834
36
    }
14835
36
    return 0;
14836
36
}
14837
14838
14839
static PyStatus
14840
init_fs_encoding(PyThreadState *tstate)
14841
36
{
14842
36
    PyInterpreterState *interp = tstate->interp;
14843
14844
    /* Update the filesystem encoding to the normalized Python codec name.
14845
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14846
       (Python codec name). */
14847
36
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14848
36
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14849
0
        _Py_DumpPathConfig(tstate);
14850
0
        return _PyStatus_ERR("failed to get the Python codec "
14851
0
                             "of the filesystem encoding");
14852
0
    }
14853
14854
36
    if (init_fs_codec(interp) < 0) {
14855
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14856
0
    }
14857
36
    return _PyStatus_OK();
14858
36
}
14859
14860
14861
PyStatus
14862
_PyUnicode_InitEncodings(PyThreadState *tstate)
14863
36
{
14864
36
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14865
36
    if (_PyStatus_EXCEPTION(status)) {
14866
0
        return status;
14867
0
    }
14868
36
    status = init_fs_encoding(tstate);
14869
36
    if (_PyStatus_EXCEPTION(status)) {
14870
0
        return status;
14871
0
    }
14872
14873
36
    return init_stdio_encoding(tstate->interp);
14874
36
}
14875
14876
14877
static void
14878
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14879
0
{
14880
0
    PyMem_RawFree(fs_codec->encoding);
14881
0
    fs_codec->encoding = NULL;
14882
0
    fs_codec->utf8 = 0;
14883
0
    PyMem_RawFree(fs_codec->errors);
14884
0
    fs_codec->errors = NULL;
14885
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14886
0
}
14887
14888
14889
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529), then re-initialize the filesystem
   codec state from the updated configuration.

   Returns the result of init_fs_codec() on success, or -1 with MemoryError
   set if the new configuration strings cannot be allocated (in which case
   the configuration is left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *cfg = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements up front so the config is only modified
       once both are known to be available. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    PyMem_RawFree(cfg->filesystem_encoding);
    PyMem_RawFree(cfg->filesystem_errors);
    cfg->filesystem_encoding = new_encoding;
    cfg->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
14914
14915
14916
#ifdef Py_DEBUG
/* Debug-build helper: returns non-zero when the main interpreter's
   interned-strings dict is NULL. */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
14923
14924
14925
void
14926
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14927
0
{
14928
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14929
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14930
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14931
0
}
14932
14933
14934
void
14935
_PyUnicode_Fini(PyInterpreterState *interp)
14936
0
{
14937
0
    struct _Py_unicode_state *state = &interp->unicode;
14938
14939
0
    if (!has_shared_intern_dict(interp)) {
14940
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14941
0
        assert(get_interned_dict(interp) == NULL);
14942
0
    }
14943
14944
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14945
14946
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14947
    // subsequent initialization of interpreter.
14948
0
    interp->unicode.ucnhash_capi = NULL;
14949
14950
0
    unicode_clear_identifiers(state);
14951
0
}
14952
14953
/* A _string module, to export formatter_parser and formatter_field_name_split
14954
   to the string.Formatter class implemented in Python. */
14955
14956
static PyMethodDef _string_methods[] = {
14957
    {"formatter_field_name_split", formatter_field_name_split,
14958
     METH_O, PyDoc_STR("split the argument as a field name")},
14959
    {"formatter_parser", formatter_parser,
14960
     METH_O, PyDoc_STR("parse the argument as a format string")},
14961
    {NULL, NULL}
14962
};
14963
14964
static PyModuleDef_Slot module_slots[] = {
14965
    _Py_ABI_SLOT,
14966
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14967
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14968
    {0, NULL}
14969
};
14970
14971
static struct PyModuleDef _string_module = {
14972
    PyModuleDef_HEAD_INIT,
14973
    .m_name = "_string",
14974
    .m_doc = PyDoc_STR("string helper module"),
14975
    .m_size = 0,
14976
    .m_methods = _string_methods,
14977
    .m_slots = module_slots,
14978
};
14979
14980
PyMODINIT_FUNC
14981
PyInit__string(void)
14982
8
{
14983
8
    return PyModuleDef_Init(&_string_module);
14984
8
}
14985
14986
14987
#undef PyUnicode_KIND
14988
int PyUnicode_KIND(PyObject *op)
14989
0
{
14990
0
    if (!PyUnicode_Check(op)) {
14991
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
14992
0
        return -1;
14993
0
    }
14994
0
    return _PyASCIIObject_CAST(op)->state.kind;
14995
0
}
14996
14997
#undef PyUnicode_DATA
14998
void* PyUnicode_DATA(PyObject *op)
14999
0
{
15000
0
    if (!PyUnicode_Check(op)) {
15001
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15002
0
        return NULL;
15003
0
    }
15004
0
    return _PyUnicode_DATA(op);
15005
0
}