Coverage Report

Created: 2026-05-30 06:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_RepeatBuffer()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_initconfig.h"    // _PyStatus_OK()
50
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
51
#include "pycore_long.h"          // _PyLong_FormatWriter()
52
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
53
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
54
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
55
#include "pycore_pyhash.h"        // _Py_HashSecret_t
56
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
57
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
58
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
59
#include "pycore_unicodectype.h"  // _PyUnicode_IsXidStart
60
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
61
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
62
63
#include "stringlib/eq.h"         // unicode_eq()
64
#include <stddef.h>               // ptrdiff_t
65
66
#ifdef MS_WINDOWS
67
#include <windows.h>
68
#endif
69
70
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
71
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
72
#endif
73
74
/* Uncomment to display statistics on interned strings at exit
75
   in _PyUnicode_ClearInterned(). */
76
/* #define INTERNED_STATS 1 */
77
78
79
/*[clinic input]
80
class str "PyObject *" "&PyUnicode_Type"
81
[clinic start generated code]*/
82
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
83
84
/*[python input]
85
class Py_UCS4_converter(CConverter):
86
    type = 'Py_UCS4'
87
    converter = 'convert_uc'
88
89
    def c_default_init(self):
90
        import libclinic
91
        self.c_default = libclinic.c_unichar_repr(self.default)
92
93
[python start generated code]*/
94
/*[python end generated code: output=da39a3ee5e6b4b0d input=22f057b68fd9a65a]*/
95
96
/* --- Globals ------------------------------------------------------------
97
98
NOTE: In the interpreter's initialization phase, some globals are currently
99
      initialized dynamically as needed. In the process Unicode objects may
100
      be created before the Unicode type is ready.
101
102
*/
103
104
23.0M
#define MAX_UNICODE _Py_MAX_UNICODE
105
268M
#define ensure_unicode _PyUnicode_EnsureUnicode
106
107
#ifdef Py_DEBUG
108
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
109
#else
110
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
111
#endif
112
113
static inline char* _PyUnicode_UTF8(PyObject *op)
114
255M
{
115
255M
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
116
255M
}
117
118
static inline char* PyUnicode_UTF8(PyObject *op)
119
154M
{
120
154M
    assert(_PyUnicode_CHECK(op));
121
154M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
122
139M
        return ((char*)(_PyASCIIObject_CAST(op) + 1));
123
139M
    }
124
15.0M
    else {
125
15.0M
         return _PyUnicode_UTF8(op);
126
15.0M
    }
127
154M
}
128
129
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
130
30.8M
{
131
30.8M
    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
132
30.8M
}
133
134
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
135
72.5M
{
136
72.5M
    assert(_PyUnicode_CHECK(op));
137
72.5M
    if (PyUnicode_IS_COMPACT_ASCII(op)) {
138
69.5M
         return _PyASCIIObject_CAST(op)->length;
139
69.5M
    }
140
2.98M
    else {
141
2.98M
         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
142
2.98M
    }
143
72.5M
}
144
145
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
146
30.8M
{
147
30.8M
    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
148
30.8M
}
149
150
#define _PyUnicode_LENGTH(op)                           \
151
612M
    (_PyASCIIObject_CAST(op)->length)
152
#define _PyUnicode_STATE(op)                            \
153
3.90G
    (_PyASCIIObject_CAST(op)->state)
154
#define _PyUnicode_HASH(op)                             \
155
572M
    (_PyASCIIObject_CAST(op)->hash)
156
157
181M
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
158
159
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
160
51.1M
{
161
51.1M
    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
162
51.1M
}
163
164
#define _PyUnicode_DATA_ANY(op)                         \
165
66.6M
    (_PyUnicodeObject_CAST(op)->data.any)
166
167
static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
168
0
{
169
0
    assert(_PyUnicode_CHECK(op));
170
0
    assert(!PyUnicode_IS_COMPACT_ASCII(op));
171
0
    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
172
0
}
173
174
/* true if the Unicode object has an allocated UTF-8 memory block
175
   (not shared with other data) */
176
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
177
611M
{
178
611M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
179
225M
            && _PyUnicode_UTF8(op) != NULL
180
14.1M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
181
611M
}
182
183
184
241M
#define LATIN1 _Py_LATIN1_CHR
185
186
/* Forward declaration */
187
static PyObject *
188
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
189
                    const char *errors);
190
static PyObject *
191
unicode_decode_utf8(const char *s, Py_ssize_t size,
192
                    _Py_error_handler error_handler, const char *errors,
193
                    Py_ssize_t *consumed);
194
#ifdef Py_DEBUG
195
static inline int unicode_is_finalizing(void);
196
static int unicode_is_singleton(PyObject *unicode);
197
#endif
198
199
200
// Return a reference to the immortal empty string singleton.
201
PyObject*
202
_PyUnicode_GetEmpty(void)
203
108M
{
204
108M
    _Py_DECLARE_STR(empty, "");
205
108M
    return &_Py_STR(empty);
206
108M
}
207
208
/* This dictionary holds per-interpreter interned strings.
209
 * See InternalDocs/string_interning.md for details.
210
 */
211
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
212
6.07M
{
213
6.07M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
214
6.07M
}
215
216
/* This hashtable holds statically allocated interned strings.
217
 * See InternalDocs/string_interning.md for details.
218
 */
219
6.16M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
220
221
/* Get number of all interned strings for the current interpreter. */
222
Py_ssize_t
223
_PyUnicode_InternedSize(void)
224
0
{
225
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
226
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
227
0
}
228
229
/* Get number of immortal interned strings for the current interpreter. */
230
Py_ssize_t
231
_PyUnicode_InternedSize_Immortal(void)
232
0
{
233
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
234
0
    PyObject *key, *value;
235
0
    Py_ssize_t pos = 0;
236
0
    Py_ssize_t count = 0;
237
238
    // It's tempting to keep a count and avoid a loop here. But, this function
239
    // is intended for refleak tests. It spends extra work to report the true
240
    // value, to help detect bugs in optimizations.
241
242
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
243
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
244
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
245
0
           count++;
246
0
       }
247
0
    }
248
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
249
0
}
250
251
static Py_hash_t unicode_hash(PyObject *);
252
253
static Py_uhash_t
254
hashtable_unicode_hash(const void *key)
255
6.16M
{
256
6.16M
    return unicode_hash((PyObject *)key);
257
6.16M
}
258
259
static int
260
hashtable_unicode_compare(const void *key1, const void *key2)
261
542k
{
262
542k
    PyObject *obj1 = (PyObject *)key1;
263
542k
    PyObject *obj2 = (PyObject *)key2;
264
542k
    if (obj1 != NULL && obj2 != NULL) {
265
542k
        return unicode_eq(obj1, obj2);
266
542k
    }
267
0
    else {
268
0
        return obj1 == obj2;
269
0
    }
270
542k
}
271
272
/* Return true if this interpreter should share the main interpreter's
273
   intern_dict.  That's important for interpreters which load basic
274
   single-phase init extension modules (m_size == -1).  There could be interned
275
   immortal strings that are shared between interpreters, due to the
276
   PyDict_Update(mdict, m_copy) call in import_find_extension().
277
278
   It's not safe to deallocate those strings until all interpreters that
279
   potentially use them are freed.  By storing them in the main interpreter, we
280
   ensure they get freed after all other interpreters are freed.
281
*/
282
static bool
283
has_shared_intern_dict(PyInterpreterState *interp)
284
37
{
285
37
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
286
37
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
287
37
}
288
289
static int
290
init_interned_dict(PyInterpreterState *interp)
291
37
{
292
37
    assert(get_interned_dict(interp) == NULL);
293
37
    PyObject *interned;
294
37
    if (has_shared_intern_dict(interp)) {
295
0
        interned = get_interned_dict(_PyInterpreterState_Main());
296
0
        Py_INCREF(interned);
297
0
    }
298
37
    else {
299
37
        interned = PyDict_New();
300
37
        if (interned == NULL) {
301
0
            return -1;
302
0
        }
303
37
    }
304
37
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
305
37
    return 0;
306
37
}
307
308
static void
309
clear_interned_dict(PyInterpreterState *interp)
310
0
{
311
0
    PyObject *interned = get_interned_dict(interp);
312
0
    if (interned != NULL) {
313
0
        if (!has_shared_intern_dict(interp)) {
314
            // only clear if the dict belongs to this interpreter
315
0
            PyDict_Clear(interned);
316
0
        }
317
0
        Py_DECREF(interned);
318
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
319
0
    }
320
0
}
321
322
static PyStatus
323
init_global_interned_strings(PyInterpreterState *interp)
324
37
{
325
37
    assert(INTERNED_STRINGS == NULL);
326
37
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
327
328
37
    INTERNED_STRINGS = _Py_hashtable_new_full(
329
37
        hashtable_unicode_hash,
330
37
        hashtable_unicode_compare,
331
        // Objects stored here are immortal and statically allocated,
332
        // so we don't need key_destroy_func & value_destroy_func:
333
37
        NULL,
334
37
        NULL,
335
37
        &hashtable_alloc
336
37
    );
337
37
    if (INTERNED_STRINGS == NULL) {
338
0
        PyErr_Clear();
339
0
        return _PyStatus_ERR("failed to create global interned dict");
340
0
    }
341
342
    /* Intern statically allocated string identifiers, deepfreeze strings,
343
        * and one-byte latin-1 strings.
344
        * This must be done before any module initialization so that statically
345
        * allocated string identifiers are used instead of heap allocated strings.
346
        * Deepfreeze uses the interned identifiers if present to save space
347
        * else generates them and they are interned to speed up dict lookups.
348
    */
349
37
    _PyUnicode_InitStaticStrings(interp);
350
351
9.50k
    for (int i = 0; i < 256; i++) {
352
9.47k
        PyObject *s = LATIN1(i);
353
9.47k
        _PyUnicode_InternStatic(interp, &s);
354
9.47k
        assert(s == LATIN1(i));
355
9.47k
    }
356
#ifdef Py_DEBUG
357
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
358
359
    for (int i = 0; i < 256; i++) {
360
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
361
    }
362
#endif
363
37
    return _PyStatus_OK();
364
37
}
365
366
static void clear_global_interned_strings(void)
367
0
{
368
0
    if (INTERNED_STRINGS != NULL) {
369
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
370
0
        INTERNED_STRINGS = NULL;
371
0
    }
372
0
}
373
374
#define _Py_RETURN_UNICODE_EMPTY()   \
375
50.8M
    do {                             \
376
50.8M
        return _PyUnicode_GetEmpty();\
377
50.8M
    } while (0)
378
379
380
/* Fast detection of the most frequent whitespace characters */
381
const unsigned char _Py_ascii_whitespace[] = {
382
    0, 0, 0, 0, 0, 0, 0, 0,
383
/*     case 0x0009: * CHARACTER TABULATION */
384
/*     case 0x000A: * LINE FEED */
385
/*     case 0x000B: * LINE TABULATION */
386
/*     case 0x000C: * FORM FEED */
387
/*     case 0x000D: * CARRIAGE RETURN */
388
    0, 1, 1, 1, 1, 1, 0, 0,
389
    0, 0, 0, 0, 0, 0, 0, 0,
390
/*     case 0x001C: * FILE SEPARATOR */
391
/*     case 0x001D: * GROUP SEPARATOR */
392
/*     case 0x001E: * RECORD SEPARATOR */
393
/*     case 0x001F: * UNIT SEPARATOR */
394
    0, 0, 0, 0, 1, 1, 1, 1,
395
/*     case 0x0020: * SPACE */
396
    1, 0, 0, 0, 0, 0, 0, 0,
397
    0, 0, 0, 0, 0, 0, 0, 0,
398
    0, 0, 0, 0, 0, 0, 0, 0,
399
    0, 0, 0, 0, 0, 0, 0, 0,
400
401
    0, 0, 0, 0, 0, 0, 0, 0,
402
    0, 0, 0, 0, 0, 0, 0, 0,
403
    0, 0, 0, 0, 0, 0, 0, 0,
404
    0, 0, 0, 0, 0, 0, 0, 0,
405
    0, 0, 0, 0, 0, 0, 0, 0,
406
    0, 0, 0, 0, 0, 0, 0, 0,
407
    0, 0, 0, 0, 0, 0, 0, 0,
408
    0, 0, 0, 0, 0, 0, 0, 0
409
};
410
411
/* forward */
412
static PyObject* get_latin1_char(unsigned char ch);
413
414
415
static PyObject *
416
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
417
static PyObject *
418
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
419
static PyObject *
420
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
421
422
static PyObject *
423
unicode_encode_call_errorhandler(const char *errors,
424
       PyObject **errorHandler,const char *encoding, const char *reason,
425
       PyObject *unicode, PyObject **exceptionObject,
426
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
427
428
static void
429
raise_encode_exception(PyObject **exceptionObject,
430
                       const char *encoding,
431
                       PyObject *unicode,
432
                       Py_ssize_t startpos, Py_ssize_t endpos,
433
                       const char *reason);
434
435
/* Same for linebreaks */
436
static const unsigned char ascii_linebreak[] = {
437
    0, 0, 0, 0, 0, 0, 0, 0,
438
/*         0x000A, * LINE FEED */
439
/*         0x000B, * LINE TABULATION */
440
/*         0x000C, * FORM FEED */
441
/*         0x000D, * CARRIAGE RETURN */
442
    0, 0, 1, 1, 1, 1, 0, 0,
443
    0, 0, 0, 0, 0, 0, 0, 0,
444
/*         0x001C, * FILE SEPARATOR */
445
/*         0x001D, * GROUP SEPARATOR */
446
/*         0x001E, * RECORD SEPARATOR */
447
    0, 0, 0, 0, 1, 1, 1, 0,
448
    0, 0, 0, 0, 0, 0, 0, 0,
449
    0, 0, 0, 0, 0, 0, 0, 0,
450
    0, 0, 0, 0, 0, 0, 0, 0,
451
    0, 0, 0, 0, 0, 0, 0, 0,
452
453
    0, 0, 0, 0, 0, 0, 0, 0,
454
    0, 0, 0, 0, 0, 0, 0, 0,
455
    0, 0, 0, 0, 0, 0, 0, 0,
456
    0, 0, 0, 0, 0, 0, 0, 0,
457
    0, 0, 0, 0, 0, 0, 0, 0,
458
    0, 0, 0, 0, 0, 0, 0, 0,
459
    0, 0, 0, 0, 0, 0, 0, 0,
460
    0, 0, 0, 0, 0, 0, 0, 0
461
};
462
463
static int convert_uc(PyObject *obj, void *addr);
464
465
struct encoding_map;
466
#include "clinic/unicodeobject.c.h"
467
468
_Py_error_handler
469
_Py_GetErrorHandler(const char *errors)
470
3.70M
{
471
3.70M
    if (errors == NULL || strcmp(errors, "strict") == 0) {
472
2.98M
        return _Py_ERROR_STRICT;
473
2.98M
    }
474
715k
    if (strcmp(errors, "surrogateescape") == 0) {
475
494k
        return _Py_ERROR_SURROGATEESCAPE;
476
494k
    }
477
220k
    if (strcmp(errors, "replace") == 0) {
478
220k
        return _Py_ERROR_REPLACE;
479
220k
    }
480
4
    if (strcmp(errors, "ignore") == 0) {
481
0
        return _Py_ERROR_IGNORE;
482
0
    }
483
4
    if (strcmp(errors, "backslashreplace") == 0) {
484
0
        return _Py_ERROR_BACKSLASHREPLACE;
485
0
    }
486
4
    if (strcmp(errors, "surrogatepass") == 0) {
487
4
        return _Py_ERROR_SURROGATEPASS;
488
4
    }
489
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
490
0
        return _Py_ERROR_XMLCHARREFREPLACE;
491
0
    }
492
0
    return _Py_ERROR_OTHER;
493
0
}
494
495
496
static _Py_error_handler
497
get_error_handler_wide(const wchar_t *errors)
498
74
{
499
74
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
500
0
        return _Py_ERROR_STRICT;
501
0
    }
502
74
    if (wcscmp(errors, L"surrogateescape") == 0) {
503
74
        return _Py_ERROR_SURROGATEESCAPE;
504
74
    }
505
0
    if (wcscmp(errors, L"replace") == 0) {
506
0
        return _Py_ERROR_REPLACE;
507
0
    }
508
0
    if (wcscmp(errors, L"ignore") == 0) {
509
0
        return _Py_ERROR_IGNORE;
510
0
    }
511
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
512
0
        return _Py_ERROR_BACKSLASHREPLACE;
513
0
    }
514
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
515
0
        return _Py_ERROR_SURROGATEPASS;
516
0
    }
517
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
518
0
        return _Py_ERROR_XMLCHARREFREPLACE;
519
0
    }
520
0
    return _Py_ERROR_OTHER;
521
0
}
522
523
524
static inline int
525
unicode_check_encoding_errors(const char *encoding, const char *errors)
526
41.7M
{
527
41.7M
    if (encoding == NULL && errors == NULL) {
528
12.8M
        return 0;
529
12.8M
    }
530
531
28.8M
    PyInterpreterState *interp = _PyInterpreterState_GET();
532
28.8M
#ifndef Py_DEBUG
533
    /* In release mode, only check in development mode (-X dev) */
534
28.8M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
535
28.8M
        return 0;
536
28.8M
    }
537
#else
538
    /* Always check in debug mode */
539
#endif
540
541
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
542
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
543
0
    if (!interp->unicode.fs_codec.encoding) {
544
0
        return 0;
545
0
    }
546
547
    /* Disable checks during Python finalization. For example, it allows to
548
     * call PyObject_Dump() during finalization for debugging purpose.
549
     */
550
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
551
0
        return 0;
552
0
    }
553
554
0
    if (encoding != NULL
555
        // Fast path for the most common built-in encodings. Even if the codec
556
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
557
        // create a temporary Unicode string (the key in the cache).
558
0
        && strcmp(encoding, "utf-8") != 0
559
0
        && strcmp(encoding, "utf8") != 0
560
0
        && strcmp(encoding, "ascii") != 0)
561
0
    {
562
0
        PyObject *handler = _PyCodec_Lookup(encoding);
563
0
        if (handler == NULL) {
564
0
            return -1;
565
0
        }
566
0
        Py_DECREF(handler);
567
0
    }
568
569
0
    if (errors != NULL
570
        // Fast path for the most common built-in error handlers.
571
0
        && strcmp(errors, "strict") != 0
572
0
        && strcmp(errors, "ignore") != 0
573
0
        && strcmp(errors, "replace") != 0
574
0
        && strcmp(errors, "surrogateescape") != 0
575
0
        && strcmp(errors, "surrogatepass") != 0)
576
0
    {
577
0
        PyObject *handler = PyCodec_LookupError(errors);
578
0
        if (handler == NULL) {
579
0
            return -1;
580
0
        }
581
0
        Py_DECREF(handler);
582
0
    }
583
0
    return 0;
584
0
}
585
586
587
int
588
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
589
0
{
590
0
#define CHECK(expr) \
591
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
592
#ifdef Py_GIL_DISABLED
593
# define CHECK_IF_GIL(expr) (void)(expr)
594
# define CHECK_IF_FT(expr) CHECK(expr)
595
#else
596
0
# define CHECK_IF_GIL(expr) CHECK(expr)
597
0
# define CHECK_IF_FT(expr) (void)(expr)
598
0
#endif
599
600
601
0
    assert(op != NULL);
602
0
    CHECK(PyUnicode_Check(op));
603
604
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
605
0
    int kind = ascii->state.kind;
606
607
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
608
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
609
0
    }
610
0
    else {
611
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
612
0
        void *data;
613
614
0
        if (ascii->state.compact == 1) {
615
0
            data = compact + 1;
616
0
            CHECK(kind == PyUnicode_1BYTE_KIND
617
0
                                 || kind == PyUnicode_2BYTE_KIND
618
0
                                 || kind == PyUnicode_4BYTE_KIND);
619
0
            CHECK(ascii->state.ascii == 0);
620
0
            CHECK(_PyUnicode_UTF8(op) != data);
621
0
        }
622
0
        else {
623
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
624
625
0
            data = unicode->data.any;
626
0
            CHECK(kind == PyUnicode_1BYTE_KIND
627
0
                     || kind == PyUnicode_2BYTE_KIND
628
0
                     || kind == PyUnicode_4BYTE_KIND);
629
0
            CHECK(ascii->state.compact == 0);
630
0
            CHECK(data != NULL);
631
0
            if (ascii->state.ascii) {
632
0
                CHECK(_PyUnicode_UTF8(op) == data);
633
0
                CHECK(compact->utf8_length == ascii->length);
634
0
            }
635
0
            else {
636
0
                CHECK(_PyUnicode_UTF8(op) != data);
637
0
            }
638
0
        }
639
0
#ifndef Py_GIL_DISABLED
640
0
        if (_PyUnicode_UTF8(op) == NULL)
641
0
            CHECK(compact->utf8_length == 0);
642
0
#endif
643
0
    }
644
645
    /* check that the best kind is used: O(n) operation */
646
0
    if (check_content) {
647
0
        Py_ssize_t i;
648
0
        Py_UCS4 maxchar = 0;
649
0
        const void *data;
650
0
        Py_UCS4 ch;
651
652
0
        data = PyUnicode_DATA(ascii);
653
0
        for (i=0; i < ascii->length; i++)
654
0
        {
655
0
            ch = PyUnicode_READ(kind, data, i);
656
0
            if (ch > maxchar)
657
0
                maxchar = ch;
658
0
        }
659
0
        if (kind == PyUnicode_1BYTE_KIND) {
660
0
            if (ascii->state.ascii == 0) {
661
0
                CHECK(maxchar >= 128);
662
0
                CHECK(maxchar <= 255);
663
0
            }
664
0
            else
665
0
                CHECK(maxchar < 128);
666
0
        }
667
0
        else if (kind == PyUnicode_2BYTE_KIND) {
668
0
            CHECK(maxchar >= 0x100);
669
0
            CHECK(maxchar <= 0xFFFF);
670
0
        }
671
0
        else {
672
0
            CHECK(maxchar >= 0x10000);
673
0
            CHECK(maxchar <= MAX_UNICODE);
674
0
        }
675
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
676
0
    }
677
678
    /* Check interning state */
679
#ifdef Py_DEBUG
680
    // Note that we do not check `_Py_IsImmortal(op)` in the GIL-enabled build
681
    // since stable ABI extensions can make immortal strings mortal (but with a
682
    // high enough refcount).
683
    switch (PyUnicode_CHECK_INTERNED(op)) {
684
        case SSTATE_NOT_INTERNED:
685
            if (ascii->state.statically_allocated) {
686
                // This state is for two exceptions:
687
                // - strings are currently checked before they're interned
688
                // - the 256 one-latin1-character strings
689
                //   are static but use SSTATE_NOT_INTERNED
690
            }
691
            else {
692
                CHECK_IF_GIL(!_Py_IsImmortal(op));
693
            }
694
            break;
695
        case SSTATE_INTERNED_MORTAL:
696
            CHECK(!ascii->state.statically_allocated);
697
            CHECK_IF_GIL(!_Py_IsImmortal(op));
698
            break;
699
        case SSTATE_INTERNED_IMMORTAL:
700
            CHECK(!ascii->state.statically_allocated);
701
            CHECK_IF_FT(_Py_IsImmortal(op));
702
            break;
703
        case SSTATE_INTERNED_IMMORTAL_STATIC:
704
            CHECK(ascii->state.statically_allocated);
705
            CHECK_IF_FT(_Py_IsImmortal(op));
706
            break;
707
        default:
708
            Py_UNREACHABLE();
709
    }
710
#endif
711
712
0
    return 1;
713
714
0
#undef CHECK
715
0
}
716
717
PyObject*
718
_PyUnicode_Result(PyObject *unicode)
719
55.3M
{
720
55.3M
    assert(_PyUnicode_CHECK(unicode));
721
722
55.3M
    Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
723
55.3M
    if (length == 0) {
724
230
        PyObject *empty = _PyUnicode_GetEmpty();
725
230
        if (unicode != empty) {
726
0
            Py_DECREF(unicode);
727
0
        }
728
230
        return empty;
729
230
    }
730
731
55.3M
    if (length == 1) {
732
2.57M
        int kind = PyUnicode_KIND(unicode);
733
2.57M
        if (kind == PyUnicode_1BYTE_KIND) {
734
131k
            const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
735
131k
            Py_UCS1 ch = data[0];
736
131k
            PyObject *latin1_char = LATIN1(ch);
737
131k
            if (unicode != latin1_char) {
738
126k
                Py_DECREF(unicode);
739
126k
            }
740
131k
            return latin1_char;
741
131k
        }
742
2.57M
    }
743
744
55.3M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
745
55.2M
    return unicode;
746
55.3M
}
747
1.52M
#define unicode_result _PyUnicode_Result
748
749
static PyObject*
750
unicode_result_unchanged(PyObject *unicode)
751
96.6M
{
752
96.6M
    if (PyUnicode_CheckExact(unicode)) {
753
93.4M
        return Py_NewRef(unicode);
754
93.4M
    }
755
3.24M
    else
756
        /* Subtype -- return genuine unicode string with the same value. */
757
3.24M
        return _PyUnicode_Copy(unicode);
758
96.6M
}
759
760
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
761
   ASCII, Latin1, UTF-8, etc. */
762
static char*
763
backslashreplace(PyBytesWriter *writer, char *str,
764
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
765
0
{
766
0
    Py_ssize_t size, i;
767
0
    Py_UCS4 ch;
768
0
    int kind;
769
0
    const void *data;
770
771
0
    kind = PyUnicode_KIND(unicode);
772
0
    data = PyUnicode_DATA(unicode);
773
774
0
    size = 0;
775
    /* determine replacement size */
776
0
    for (i = collstart; i < collend; ++i) {
777
0
        Py_ssize_t incr;
778
779
0
        ch = PyUnicode_READ(kind, data, i);
780
0
        if (ch < 0x100)
781
0
            incr = 2+2;
782
0
        else if (ch < 0x10000)
783
0
            incr = 2+4;
784
0
        else {
785
0
            assert(ch <= MAX_UNICODE);
786
0
            incr = 2+8;
787
0
        }
788
0
        if (size > PY_SSIZE_T_MAX - incr) {
789
0
            PyErr_SetString(PyExc_OverflowError,
790
0
                            "encoded result is too long for a Python string");
791
0
            return NULL;
792
0
        }
793
0
        size += incr;
794
0
    }
795
796
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
797
0
    if (str == NULL) {
798
0
        return NULL;
799
0
    }
800
801
    /* generate replacement */
802
0
    for (i = collstart; i < collend; ++i) {
803
0
        ch = PyUnicode_READ(kind, data, i);
804
0
        *str++ = '\\';
805
0
        if (ch >= 0x00010000) {
806
0
            *str++ = 'U';
807
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
808
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
809
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
810
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
811
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
812
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
813
0
        }
814
0
        else if (ch >= 0x100) {
815
0
            *str++ = 'u';
816
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
817
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
818
0
        }
819
0
        else
820
0
            *str++ = 'x';
821
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
822
0
        *str++ = Py_hexdigits[ch&0xf];
823
0
    }
824
0
    return str;
825
0
}
826
827
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
828
   ASCII, Latin1, UTF-8, etc. */
829
static char*
830
xmlcharrefreplace(PyBytesWriter *writer, char *str,
831
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
832
0
{
833
0
    Py_ssize_t size, i;
834
0
    Py_UCS4 ch;
835
0
    int kind;
836
0
    const void *data;
837
838
0
    kind = PyUnicode_KIND(unicode);
839
0
    data = PyUnicode_DATA(unicode);
840
841
0
    size = 0;
842
    /* determine replacement size */
843
0
    for (i = collstart; i < collend; ++i) {
844
0
        Py_ssize_t incr;
845
846
0
        ch = PyUnicode_READ(kind, data, i);
847
0
        if (ch < 10)
848
0
            incr = 2+1+1;
849
0
        else if (ch < 100)
850
0
            incr = 2+2+1;
851
0
        else if (ch < 1000)
852
0
            incr = 2+3+1;
853
0
        else if (ch < 10000)
854
0
            incr = 2+4+1;
855
0
        else if (ch < 100000)
856
0
            incr = 2+5+1;
857
0
        else if (ch < 1000000)
858
0
            incr = 2+6+1;
859
0
        else {
860
0
            assert(ch <= MAX_UNICODE);
861
0
            incr = 2+7+1;
862
0
        }
863
0
        if (size > PY_SSIZE_T_MAX - incr) {
864
0
            PyErr_SetString(PyExc_OverflowError,
865
0
                            "encoded result is too long for a Python string");
866
0
            return NULL;
867
0
        }
868
0
        size += incr;
869
0
    }
870
871
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
872
0
    if (str == NULL) {
873
0
        return NULL;
874
0
    }
875
876
    /* generate replacement */
877
0
    for (i = collstart; i < collend; ++i) {
878
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
879
0
        if (size < 0) {
880
0
            return NULL;
881
0
        }
882
0
        str += size;
883
0
    }
884
0
    return str;
885
0
}
886
887
/* --- Bloom Filters ----------------------------------------------------- */
888
889
/* stuff to implement simple "bloom filters" for Unicode characters.
890
   to keep things simple, we use a single bitmask, using the least 5
891
   bits from each unicode characters as the bit index. */
892
893
/* the linebreak mask is set up by _PyUnicode_Init() below */
894
895
#if LONG_BIT >= 128
896
#define BLOOM_WIDTH 128
897
#elif LONG_BIT >= 64
898
22.0M
#define BLOOM_WIDTH 64
899
#elif LONG_BIT >= 32
900
#define BLOOM_WIDTH 32
901
#else
902
#error "LONG_BIT is smaller than 32"
903
#endif
904
905
8.61M
#define BLOOM_MASK unsigned long
906
907
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
908
909
28.0M
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
910
911
#define BLOOM_LINEBREAK(ch)                                             \
912
124M
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
913
124M
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
914
915
static inline BLOOM_MASK
916
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
917
4.30M
{
918
4.30M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
919
4.30M
    do {                                               \
920
4.30M
        TYPE *data = (TYPE *)PTR;                      \
921
4.30M
        TYPE *end = data + LEN;                        \
922
4.30M
        Py_UCS4 ch;                                    \
923
10.3M
        for (; data != end; data++) {                  \
924
6.07M
            ch = *data;                                \
925
6.07M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
926
6.07M
        }                                              \
927
4.30M
        break;                                         \
928
4.30M
    } while (0)
929
930
    /* calculate simple bloom-style bitmask for a given unicode string */
931
932
4.30M
    BLOOM_MASK mask;
933
934
4.30M
    mask = 0;
935
4.30M
    switch (kind) {
936
4.30M
    case PyUnicode_1BYTE_KIND:
937
4.30M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
938
4.30M
        break;
939
37
    case PyUnicode_2BYTE_KIND:
940
37
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
941
37
        break;
942
0
    case PyUnicode_4BYTE_KIND:
943
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
944
0
        break;
945
0
    default:
946
0
        Py_UNREACHABLE();
947
4.30M
    }
948
4.30M
    return mask;
949
950
4.30M
#undef BLOOM_UPDATE
951
4.30M
}
952
953
/* Compilation of templated routines */
954
955
797k
#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
956
957
#include "stringlib/asciilib.h"
958
#include "stringlib/fastsearch.h"
959
#include "stringlib/partition.h"
960
#include "stringlib/split.h"
961
#include "stringlib/count.h"
962
#include "stringlib/find.h"
963
#include "stringlib/find_max_char.h"
964
#include "stringlib/undef.h"
965
966
#include "stringlib/ucs1lib.h"
967
#include "stringlib/fastsearch.h"
968
#include "stringlib/partition.h"
969
#include "stringlib/split.h"
970
#include "stringlib/count.h"
971
#include "stringlib/find.h"
972
#include "stringlib/replace.h"
973
#include "stringlib/repr.h"
974
#include "stringlib/find_max_char.h"
975
#include "stringlib/undef.h"
976
977
#include "stringlib/ucs2lib.h"
978
#include "stringlib/fastsearch.h"
979
#include "stringlib/partition.h"
980
#include "stringlib/split.h"
981
#include "stringlib/count.h"
982
#include "stringlib/find.h"
983
#include "stringlib/replace.h"
984
#include "stringlib/repr.h"
985
#include "stringlib/find_max_char.h"
986
#include "stringlib/undef.h"
987
988
#include "stringlib/ucs4lib.h"
989
#include "stringlib/fastsearch.h"
990
#include "stringlib/partition.h"
991
#include "stringlib/split.h"
992
#include "stringlib/count.h"
993
#include "stringlib/find.h"
994
#include "stringlib/replace.h"
995
#include "stringlib/repr.h"
996
#include "stringlib/find_max_char.h"
997
#include "stringlib/undef.h"
998
999
#undef STRINGLIB_GET_EMPTY
1000
1001
/* --- Unicode Object ----------------------------------------------------- */
1002
1003
static inline Py_ssize_t
1004
findchar(const void *s, int kind,
1005
         Py_ssize_t size, Py_UCS4 ch,
1006
         int direction)
1007
213M
{
1008
213M
    switch (kind) {
1009
206M
    case PyUnicode_1BYTE_KIND:
1010
206M
        if ((Py_UCS1) ch != ch)
1011
3.52k
            return -1;
1012
206M
        if (direction > 0)
1013
206M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1014
64.6k
        else
1015
64.6k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1016
4.82M
    case PyUnicode_2BYTE_KIND:
1017
4.82M
        if ((Py_UCS2) ch != ch)
1018
0
            return -1;
1019
4.82M
        if (direction > 0)
1020
4.63M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1021
187k
        else
1022
187k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1023
2.81M
    case PyUnicode_4BYTE_KIND:
1024
2.81M
        if (direction > 0)
1025
2.68M
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1026
128k
        else
1027
128k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1028
0
    default:
1029
0
        Py_UNREACHABLE();
1030
213M
    }
1031
213M
}
1032
1033
#ifdef Py_DEBUG
1034
/* Fill the data of a Unicode string with invalid characters to detect bugs
1035
   earlier.
1036
1037
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1038
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1039
   invalid character in Unicode 6.0. */
1040
static void
1041
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1042
{
1043
    int kind = PyUnicode_KIND(unicode);
1044
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1045
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1046
    if (length <= old_length)
1047
        return;
1048
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1049
}
1050
#endif
1051
1052
static PyObject*
1053
resize_copy(PyObject *unicode, Py_ssize_t length)
1054
0
{
1055
0
    Py_ssize_t copy_length;
1056
0
    PyObject *copy;
1057
1058
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1059
0
    if (copy == NULL)
1060
0
        return NULL;
1061
1062
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1063
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1064
0
    return copy;
1065
0
}
1066
1067
PyObject*
1068
_PyUnicode_ResizeCompact(PyObject *unicode, Py_ssize_t length)
1069
56.6M
{
1070
56.6M
    Py_ssize_t char_size;
1071
56.6M
    Py_ssize_t struct_size;
1072
56.6M
    Py_ssize_t new_size;
1073
56.6M
    PyObject *new_unicode;
1074
#ifdef Py_DEBUG
1075
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1076
#endif
1077
1078
56.6M
    if (!_PyUnicode_IsModifiable(unicode)) {
1079
0
        PyObject *copy = resize_copy(unicode, length);
1080
0
        if (copy == NULL) {
1081
0
            return NULL;
1082
0
        }
1083
0
        Py_DECREF(unicode);
1084
0
        return copy;
1085
0
    }
1086
56.6M
    assert(PyUnicode_IS_COMPACT(unicode));
1087
1088
56.6M
    char_size = PyUnicode_KIND(unicode);
1089
56.6M
    if (PyUnicode_IS_ASCII(unicode))
1090
36.9M
        struct_size = sizeof(PyASCIIObject);
1091
19.6M
    else
1092
19.6M
        struct_size = sizeof(PyCompactUnicodeObject);
1093
1094
56.6M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1095
0
        PyErr_NoMemory();
1096
0
        return NULL;
1097
0
    }
1098
56.6M
    new_size = (struct_size + (length + 1) * char_size);
1099
1100
56.6M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1101
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1102
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1103
0
        PyUnicode_SET_UTF8(unicode, NULL);
1104
0
    }
1105
#ifdef Py_TRACE_REFS
1106
    _Py_ForgetReference(unicode);
1107
#endif
1108
56.6M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1109
1110
56.6M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1111
56.6M
    if (new_unicode == NULL) {
1112
0
        _Py_NewReferenceNoTotal(unicode);
1113
0
        PyErr_NoMemory();
1114
0
        return NULL;
1115
0
    }
1116
56.6M
    unicode = new_unicode;
1117
56.6M
    _Py_NewReferenceNoTotal(unicode);
1118
1119
56.6M
    _PyUnicode_LENGTH(unicode) = length;
1120
#ifdef Py_DEBUG
1121
    unicode_fill_invalid(unicode, old_length);
1122
#endif
1123
56.6M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1124
56.6M
                    length, 0);
1125
56.6M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1126
56.6M
    return unicode;
1127
56.6M
}
1128
1129
static int
1130
resize_inplace(PyObject *unicode, Py_ssize_t length)
1131
0
{
1132
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1133
0
    assert(Py_REFCNT(unicode) == 1);
1134
1135
0
    Py_ssize_t new_size;
1136
0
    Py_ssize_t char_size;
1137
0
    int share_utf8;
1138
0
    void *data;
1139
#ifdef Py_DEBUG
1140
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1141
#endif
1142
1143
0
    data = _PyUnicode_DATA_ANY(unicode);
1144
0
    char_size = PyUnicode_KIND(unicode);
1145
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1146
1147
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1148
0
        PyErr_NoMemory();
1149
0
        return -1;
1150
0
    }
1151
0
    new_size = (length + 1) * char_size;
1152
1153
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1154
0
    {
1155
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1156
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1157
0
        PyUnicode_SET_UTF8(unicode, NULL);
1158
0
    }
1159
1160
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1161
0
    if (data == NULL) {
1162
0
        PyErr_NoMemory();
1163
0
        return -1;
1164
0
    }
1165
0
    _PyUnicode_DATA_ANY(unicode) = data;
1166
0
    if (share_utf8) {
1167
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1168
0
        PyUnicode_SET_UTF8(unicode, data);
1169
0
    }
1170
0
    _PyUnicode_LENGTH(unicode) = length;
1171
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1172
#ifdef Py_DEBUG
1173
    unicode_fill_invalid(unicode, old_length);
1174
#endif
1175
1176
    /* check for integer overflow */
1177
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1178
0
        PyErr_NoMemory();
1179
0
        return -1;
1180
0
    }
1181
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1182
0
    return 0;
1183
0
}
1184
1185
static const char*
1186
unicode_kind_name(PyObject *unicode)
1187
0
{
1188
    /* don't check consistency: unicode_kind_name() is called from
1189
       _PyUnicode_Dump() */
1190
0
    if (!PyUnicode_IS_COMPACT(unicode))
1191
0
    {
1192
0
        switch (PyUnicode_KIND(unicode))
1193
0
        {
1194
0
        case PyUnicode_1BYTE_KIND:
1195
0
            if (PyUnicode_IS_ASCII(unicode))
1196
0
                return "legacy ascii";
1197
0
            else
1198
0
                return "legacy latin1";
1199
0
        case PyUnicode_2BYTE_KIND:
1200
0
            return "legacy UCS2";
1201
0
        case PyUnicode_4BYTE_KIND:
1202
0
            return "legacy UCS4";
1203
0
        default:
1204
0
            return "<legacy invalid kind>";
1205
0
        }
1206
0
    }
1207
0
    switch (PyUnicode_KIND(unicode)) {
1208
0
    case PyUnicode_1BYTE_KIND:
1209
0
        if (PyUnicode_IS_ASCII(unicode))
1210
0
            return "ascii";
1211
0
        else
1212
0
            return "latin1";
1213
0
    case PyUnicode_2BYTE_KIND:
1214
0
        return "UCS2";
1215
0
    case PyUnicode_4BYTE_KIND:
1216
0
        return "UCS4";
1217
0
    default:
1218
0
        return "<invalid compact kind>";
1219
0
    }
1220
0
}
1221
1222
#ifdef Py_DEBUG
1223
/* Functions wrapping macros for use in debugger */
1224
const char *_PyUnicode_utf8(void *unicode_raw){
1225
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1226
    return PyUnicode_UTF8(unicode);
1227
}
1228
1229
const void *_PyUnicode_compact_data(void *unicode_raw) {
1230
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1231
    return _PyUnicode_COMPACT_DATA(unicode);
1232
}
1233
const void *_PyUnicode_data(void *unicode_raw) {
1234
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1235
    printf("obj %p\n", (void*)unicode);
1236
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1237
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1238
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1239
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1240
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1241
    return PyUnicode_DATA(unicode);
1242
}
1243
1244
void
1245
_PyUnicode_Dump(PyObject *op)
1246
{
1247
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1248
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1249
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1250
    const void *data;
1251
1252
    if (ascii->state.compact)
1253
    {
1254
        if (ascii->state.ascii)
1255
            data = (ascii + 1);
1256
        else
1257
            data = (compact + 1);
1258
    }
1259
    else
1260
        data = unicode->data.any;
1261
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1262
1263
    if (!ascii->state.ascii) {
1264
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1265
    }
1266
    printf(", data=%p\n", data);
1267
}
1268
#endif
1269
1270
1271
PyObject *
1272
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1273
563M
{
1274
    /* Optimization for empty strings */
1275
563M
    if (size == 0) {
1276
24.6M
        return _PyUnicode_GetEmpty();
1277
24.6M
    }
1278
1279
538M
    PyObject *obj;
1280
538M
    PyCompactUnicodeObject *unicode;
1281
538M
    void *data;
1282
538M
    int kind;
1283
538M
    int is_ascii;
1284
538M
    Py_ssize_t char_size;
1285
538M
    Py_ssize_t struct_size;
1286
1287
538M
    is_ascii = 0;
1288
538M
    struct_size = sizeof(PyCompactUnicodeObject);
1289
538M
    if (maxchar < 128) {
1290
349M
        kind = PyUnicode_1BYTE_KIND;
1291
349M
        char_size = 1;
1292
349M
        is_ascii = 1;
1293
349M
        struct_size = sizeof(PyASCIIObject);
1294
349M
    }
1295
189M
    else if (maxchar < 256) {
1296
13.9M
        kind = PyUnicode_1BYTE_KIND;
1297
13.9M
        char_size = 1;
1298
13.9M
    }
1299
175M
    else if (maxchar < 65536) {
1300
164M
        kind = PyUnicode_2BYTE_KIND;
1301
164M
        char_size = 2;
1302
164M
    }
1303
10.7M
    else {
1304
10.7M
        if (maxchar > MAX_UNICODE) {
1305
0
            PyErr_SetString(PyExc_SystemError,
1306
0
                            "invalid maximum character passed to PyUnicode_New");
1307
0
            return NULL;
1308
0
        }
1309
10.7M
        kind = PyUnicode_4BYTE_KIND;
1310
10.7M
        char_size = 4;
1311
10.7M
    }
1312
1313
    /* Ensure we won't overflow the size. */
1314
538M
    if (size < 0) {
1315
0
        PyErr_SetString(PyExc_SystemError,
1316
0
                        "Negative size passed to PyUnicode_New");
1317
0
        return NULL;
1318
0
    }
1319
538M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1320
0
        return PyErr_NoMemory();
1321
1322
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1323
     * PyObject_New() so we are able to allocate space for the object and
1324
     * it's data buffer.
1325
     */
1326
538M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1327
538M
    if (obj == NULL) {
1328
0
        return PyErr_NoMemory();
1329
0
    }
1330
538M
    _PyObject_Init(obj, &PyUnicode_Type);
1331
1332
538M
    unicode = (PyCompactUnicodeObject *)obj;
1333
538M
    if (is_ascii)
1334
349M
        data = ((PyASCIIObject*)obj) + 1;
1335
189M
    else
1336
189M
        data = unicode + 1;
1337
538M
    _PyUnicode_LENGTH(unicode) = size;
1338
538M
    _PyUnicode_HASH(unicode) = -1;
1339
538M
    _PyUnicode_STATE(unicode).interned = 0;
1340
538M
    _PyUnicode_STATE(unicode).kind = kind;
1341
538M
    _PyUnicode_STATE(unicode).compact = 1;
1342
538M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1343
538M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1344
538M
    if (is_ascii) {
1345
349M
        ((char*)data)[size] = 0;
1346
349M
    }
1347
189M
    else if (kind == PyUnicode_1BYTE_KIND) {
1348
13.9M
        ((char*)data)[size] = 0;
1349
13.9M
        unicode->utf8 = NULL;
1350
13.9M
        unicode->utf8_length = 0;
1351
13.9M
    }
1352
175M
    else {
1353
175M
        unicode->utf8 = NULL;
1354
175M
        unicode->utf8_length = 0;
1355
175M
        if (kind == PyUnicode_2BYTE_KIND)
1356
164M
            ((Py_UCS2*)data)[size] = 0;
1357
10.7M
        else /* kind == PyUnicode_4BYTE_KIND */
1358
10.7M
            ((Py_UCS4*)data)[size] = 0;
1359
175M
    }
1360
#ifdef Py_DEBUG
1361
    unicode_fill_invalid((PyObject*)unicode, 0);
1362
#endif
1363
538M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1364
538M
    return obj;
1365
538M
}
1366
1367
static int
1368
unicode_check_modifiable(PyObject *unicode)
1369
648
{
1370
648
    if (!_PyUnicode_IsModifiable(unicode)) {
1371
0
        PyErr_SetString(PyExc_SystemError,
1372
0
                        "Cannot modify a string currently used");
1373
0
        return -1;
1374
0
    }
1375
648
    return 0;
1376
648
}
1377
1378
static int
1379
_copy_characters(PyObject *to, Py_ssize_t to_start,
1380
                 PyObject *from, Py_ssize_t from_start,
1381
                 Py_ssize_t how_many, int check_maxchar)
1382
277M
{
1383
277M
    int from_kind, to_kind;
1384
277M
    const void *from_data;
1385
277M
    void *to_data;
1386
1387
277M
    assert(0 <= how_many);
1388
277M
    assert(0 <= from_start);
1389
277M
    assert(0 <= to_start);
1390
277M
    assert(PyUnicode_Check(from));
1391
277M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1392
1393
277M
    assert(to == NULL || PyUnicode_Check(to));
1394
1395
277M
    if (how_many == 0) {
1396
4.49M
        return 0;
1397
4.49M
    }
1398
1399
277M
    assert(to != NULL);
1400
272M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1401
1402
272M
    from_kind = PyUnicode_KIND(from);
1403
272M
    from_data = PyUnicode_DATA(from);
1404
272M
    to_kind = PyUnicode_KIND(to);
1405
272M
    to_data = PyUnicode_DATA(to);
1406
1407
#ifdef Py_DEBUG
1408
    if (!check_maxchar
1409
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1410
    {
1411
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1412
        Py_UCS4 ch;
1413
        Py_ssize_t i;
1414
        for (i=0; i < how_many; i++) {
1415
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1416
            assert(ch <= to_maxchar);
1417
        }
1418
    }
1419
#endif
1420
1421
272M
    if (from_kind == to_kind) {
1422
168M
        if (check_maxchar
1423
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1424
0
        {
1425
            /* Writing Latin-1 characters into an ASCII string requires to
1426
               check that all written characters are pure ASCII */
1427
0
            Py_UCS4 max_char;
1428
0
            max_char = ucs1lib_find_max_char(from_data,
1429
0
                                             (const Py_UCS1*)from_data + how_many);
1430
0
            if (max_char >= 128)
1431
0
                return -1;
1432
0
        }
1433
168M
        memcpy((char*)to_data + to_kind * to_start,
1434
168M
                  (const char*)from_data + from_kind * from_start,
1435
168M
                  to_kind * how_many);
1436
168M
    }
1437
103M
    else if (from_kind == PyUnicode_1BYTE_KIND
1438
101M
             && to_kind == PyUnicode_2BYTE_KIND)
1439
84.5M
    {
1440
84.5M
        _PyUnicode_CONVERT_BYTES(
1441
84.5M
            Py_UCS1, Py_UCS2,
1442
84.5M
            PyUnicode_1BYTE_DATA(from) + from_start,
1443
84.5M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1444
84.5M
            PyUnicode_2BYTE_DATA(to) + to_start
1445
84.5M
            );
1446
84.5M
    }
1447
19.3M
    else if (from_kind == PyUnicode_1BYTE_KIND
1448
16.4M
             && to_kind == PyUnicode_4BYTE_KIND)
1449
16.4M
    {
1450
16.4M
        _PyUnicode_CONVERT_BYTES(
1451
16.4M
            Py_UCS1, Py_UCS4,
1452
16.4M
            PyUnicode_1BYTE_DATA(from) + from_start,
1453
16.4M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1454
16.4M
            PyUnicode_4BYTE_DATA(to) + to_start
1455
16.4M
            );
1456
16.4M
    }
1457
2.92M
    else if (from_kind == PyUnicode_2BYTE_KIND
1458
2.91M
             && to_kind == PyUnicode_4BYTE_KIND)
1459
2.91M
    {
1460
2.91M
        _PyUnicode_CONVERT_BYTES(
1461
2.91M
            Py_UCS2, Py_UCS4,
1462
2.91M
            PyUnicode_2BYTE_DATA(from) + from_start,
1463
2.91M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1464
2.91M
            PyUnicode_4BYTE_DATA(to) + to_start
1465
2.91M
            );
1466
2.91M
    }
1467
12.6k
    else {
1468
12.6k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1469
1470
12.6k
        if (!check_maxchar) {
1471
12.6k
            if (from_kind == PyUnicode_2BYTE_KIND
1472
2.64k
                && to_kind == PyUnicode_1BYTE_KIND)
1473
2.64k
            {
1474
2.64k
                _PyUnicode_CONVERT_BYTES(
1475
2.64k
                    Py_UCS2, Py_UCS1,
1476
2.64k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1477
2.64k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1478
2.64k
                    PyUnicode_1BYTE_DATA(to) + to_start
1479
2.64k
                    );
1480
2.64k
            }
1481
9.96k
            else if (from_kind == PyUnicode_4BYTE_KIND
1482
9.96k
                     && to_kind == PyUnicode_1BYTE_KIND)
1483
6.20k
            {
1484
6.20k
                _PyUnicode_CONVERT_BYTES(
1485
6.20k
                    Py_UCS4, Py_UCS1,
1486
6.20k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1487
6.20k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1488
6.20k
                    PyUnicode_1BYTE_DATA(to) + to_start
1489
6.20k
                    );
1490
6.20k
            }
1491
3.75k
            else if (from_kind == PyUnicode_4BYTE_KIND
1492
3.75k
                     && to_kind == PyUnicode_2BYTE_KIND)
1493
3.75k
            {
1494
3.75k
                _PyUnicode_CONVERT_BYTES(
1495
3.75k
                    Py_UCS4, Py_UCS2,
1496
3.75k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1497
3.75k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1498
3.75k
                    PyUnicode_2BYTE_DATA(to) + to_start
1499
3.75k
                    );
1500
3.75k
            }
1501
0
            else {
1502
0
                Py_UNREACHABLE();
1503
0
            }
1504
12.6k
        }
1505
0
        else {
1506
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1507
0
            Py_UCS4 ch;
1508
0
            Py_ssize_t i;
1509
1510
0
            for (i=0; i < how_many; i++) {
1511
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1512
0
                if (ch > to_maxchar)
1513
0
                    return -1;
1514
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1515
0
            }
1516
0
        }
1517
12.6k
    }
1518
272M
    return 0;
1519
272M
}
1520
1521
void
1522
_PyUnicode_FastCopyCharacters(
1523
    PyObject *to, Py_ssize_t to_start,
1524
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1525
277M
{
1526
277M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1527
277M
}
1528
1529
Py_ssize_t
1530
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1531
                         PyObject *from, Py_ssize_t from_start,
1532
                         Py_ssize_t how_many)
1533
0
{
1534
0
    int err;
1535
1536
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1537
0
        PyErr_BadInternalCall();
1538
0
        return -1;
1539
0
    }
1540
1541
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1542
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1543
0
        return -1;
1544
0
    }
1545
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1546
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1547
0
        return -1;
1548
0
    }
1549
0
    if (how_many < 0) {
1550
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1551
0
        return -1;
1552
0
    }
1553
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1554
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1555
0
        PyErr_Format(PyExc_SystemError,
1556
0
                     "Cannot write %zi characters at %zi "
1557
0
                     "in a string of %zi characters",
1558
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1559
0
        return -1;
1560
0
    }
1561
1562
0
    if (how_many == 0)
1563
0
        return 0;
1564
1565
0
    if (unicode_check_modifiable(to))
1566
0
        return -1;
1567
1568
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1569
0
    if (err) {
1570
0
        PyErr_Format(PyExc_SystemError,
1571
0
                     "Cannot copy %s characters "
1572
0
                     "into a string of %s characters",
1573
0
                     unicode_kind_name(from),
1574
0
                     unicode_kind_name(to));
1575
0
        return -1;
1576
0
    }
1577
0
    return how_many;
1578
0
}
1579
1580
/* Find the maximum code point and count the number of surrogate pairs so a
1581
   correct string length can be computed before converting a string to UCS4.
1582
   This function counts single surrogates as a character and not as a pair.
1583
1584
   Return 0 on success, or -1 on error. */
1585
static int
1586
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1587
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1588
423k
{
1589
423k
    const wchar_t *iter;
1590
423k
    Py_UCS4 ch;
1591
1592
423k
    assert(num_surrogates != NULL && maxchar != NULL);
1593
423k
    *num_surrogates = 0;
1594
423k
    *maxchar = 0;
1595
1596
12.7M
    for (iter = begin; iter < end; ) {
1597
#if SIZEOF_WCHAR_T == 2
1598
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1599
            && (iter+1) < end
1600
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1601
        {
1602
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1603
            ++(*num_surrogates);
1604
            iter += 2;
1605
        }
1606
        else
1607
#endif
1608
12.3M
        {
1609
12.3M
            ch = *iter;
1610
12.3M
            iter++;
1611
12.3M
        }
1612
12.3M
        if (ch > *maxchar) {
1613
1.75M
            *maxchar = ch;
1614
1.75M
            if (*maxchar > MAX_UNICODE) {
1615
0
                PyErr_Format(PyExc_ValueError,
1616
0
                             "character U+%x is not in range [U+0000; U+%x]",
1617
0
                             ch, MAX_UNICODE);
1618
0
                return -1;
1619
0
            }
1620
1.75M
        }
1621
12.3M
    }
1622
423k
    return 0;
1623
423k
}
1624
1625
static void
1626
unicode_dealloc(PyObject *unicode)
1627
554M
{
1628
#ifdef Py_DEBUG
1629
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1630
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1631
    }
1632
#endif
1633
554M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1634
        /* This should never get called, but we also don't want to SEGV if
1635
        * we accidentally decref an immortal string out of existence. Since
1636
        * the string is an immortal object, just re-set the reference count.
1637
        */
1638
#ifdef Py_DEBUG
1639
        Py_UNREACHABLE();
1640
#endif
1641
0
        _Py_SetImmortal(unicode);
1642
0
        return;
1643
0
    }
1644
554M
    switch (_PyUnicode_STATE(unicode).interned) {
1645
553M
        case SSTATE_NOT_INTERNED:
1646
553M
            break;
1647
536k
        case SSTATE_INTERNED_MORTAL:
1648
            /* Remove the object from the intern dict.
1649
             * Before doing so, we set the refcount to 2: the key and value
1650
             * in the interned_dict.
1651
             */
1652
536k
            assert(Py_REFCNT(unicode) == 0);
1653
536k
            Py_SET_REFCNT(unicode, 2);
1654
#ifdef Py_REF_DEBUG
1655
            /* let's be pedantic with the ref total */
1656
            _Py_IncRefTotal(_PyThreadState_GET());
1657
            _Py_IncRefTotal(_PyThreadState_GET());
1658
#endif
1659
536k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1660
536k
            PyObject *interned = get_interned_dict(interp);
1661
536k
            assert(interned != NULL);
1662
536k
            PyObject *popped;
1663
536k
            int r = PyDict_Pop(interned, unicode, &popped);
1664
536k
            if (r == -1) {
1665
0
                PyErr_FormatUnraisable("Exception ignored while "
1666
0
                                       "removing an interned string %R",
1667
0
                                       unicode);
1668
                // We don't know what happened to the string. It's probably
1669
                // best to leak it:
1670
                // - if it was popped, there are no more references to it
1671
                //   so it can't cause trouble (except wasted memory)
1672
                // - if it wasn't popped, it'll remain interned
1673
0
                _Py_SetImmortal(unicode);
1674
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1675
0
                return;
1676
0
            }
1677
536k
            if (r == 0) {
1678
                // The interned string was not found in the interned_dict.
1679
#ifdef Py_DEBUG
1680
                Py_UNREACHABLE();
1681
#endif
1682
0
                _Py_SetImmortal(unicode);
1683
0
                return;
1684
0
            }
1685
            // Successfully popped.
1686
536k
            assert(popped == unicode);
1687
            // Only our `popped` reference should be left; remove it too.
1688
536k
            assert(Py_REFCNT(unicode) == 1);
1689
536k
            Py_SET_REFCNT(unicode, 0);
1690
#ifdef Py_REF_DEBUG
1691
            /* let's be pedantic with the ref total */
1692
            _Py_DecRefTotal(_PyThreadState_GET());
1693
#endif
1694
536k
            break;
1695
0
        default:
1696
            // As with `statically_allocated` above.
1697
#ifdef Py_REF_DEBUG
1698
            Py_UNREACHABLE();
1699
#endif
1700
0
            _Py_SetImmortal(unicode);
1701
0
            return;
1702
554M
    }
1703
554M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1704
159k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1705
159k
    }
1706
554M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1707
16.6M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1708
16.6M
    }
1709
1710
554M
    Py_TYPE(unicode)->tp_free(unicode);
1711
554M
}
1712
1713
#ifdef Py_DEBUG
1714
static int
1715
unicode_is_singleton(PyObject *unicode)
1716
{
1717
    if (unicode == &_Py_STR(empty)) {
1718
        return 1;
1719
    }
1720
1721
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1722
    if (ascii->length == 1) {
1723
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1724
        if (ch < 256 && LATIN1(ch) == unicode) {
1725
            return 1;
1726
        }
1727
    }
1728
    return 0;
1729
}
1730
#endif
1731
1732
int
1733
_PyUnicode_IsModifiable(PyObject *unicode)
1734
63.6M
{
1735
63.6M
    assert(_PyUnicode_CHECK(unicode));
1736
63.6M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1737
3.17M
        return 0;
1738
60.5M
    if (PyUnicode_HASH(unicode) != -1)
1739
0
        return 0;
1740
60.5M
    if (PyUnicode_CHECK_INTERNED(unicode))
1741
0
        return 0;
1742
60.5M
    if (!PyUnicode_CheckExact(unicode))
1743
0
        return 0;
1744
#ifdef Py_DEBUG
1745
    /* singleton refcount is greater than 1 */
1746
    assert(!unicode_is_singleton(unicode));
1747
#endif
1748
60.5M
    return 1;
1749
60.5M
}
1750
1751
static int
1752
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1753
1.93M
{
1754
1.93M
    PyObject *unicode;
1755
1.93M
    Py_ssize_t old_length;
1756
1757
1.93M
    assert(p_unicode != NULL);
1758
1.93M
    unicode = *p_unicode;
1759
1760
1.93M
    assert(unicode != NULL);
1761
1.93M
    assert(PyUnicode_Check(unicode));
1762
1.93M
    assert(0 <= length);
1763
1764
1.93M
    old_length = PyUnicode_GET_LENGTH(unicode);
1765
1.93M
    if (old_length == length)
1766
0
        return 0;
1767
1768
1.93M
    if (length == 0) {
1769
0
        PyObject *empty = _PyUnicode_GetEmpty();
1770
0
        Py_SETREF(*p_unicode, empty);
1771
0
        return 0;
1772
0
    }
1773
1774
1.93M
    if (!_PyUnicode_IsModifiable(unicode)) {
1775
0
        PyObject *copy = resize_copy(unicode, length);
1776
0
        if (copy == NULL)
1777
0
            return -1;
1778
0
        Py_SETREF(*p_unicode, copy);
1779
0
        return 0;
1780
0
    }
1781
1782
1.93M
    if (PyUnicode_IS_COMPACT(unicode)) {
1783
1.93M
        PyObject *new_unicode = _PyUnicode_ResizeCompact(unicode, length);
1784
1.93M
        if (new_unicode == NULL)
1785
0
            return -1;
1786
1.93M
        *p_unicode = new_unicode;
1787
1.93M
        return 0;
1788
1.93M
    }
1789
0
    return resize_inplace(unicode, length);
1790
1.93M
}
1791
1792
int
1793
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1794
0
{
1795
0
    PyObject *unicode;
1796
0
    if (p_unicode == NULL) {
1797
0
        PyErr_BadInternalCall();
1798
0
        return -1;
1799
0
    }
1800
0
    unicode = *p_unicode;
1801
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1802
0
    {
1803
0
        PyErr_BadInternalCall();
1804
0
        return -1;
1805
0
    }
1806
0
    return unicode_resize(p_unicode, length);
1807
0
}
1808
1809
static PyObject*
1810
get_latin1_char(Py_UCS1 ch)
1811
240M
{
1812
240M
    PyObject *o = LATIN1(ch);
1813
240M
    return o;
1814
240M
}
1815
1816
static PyObject*
1817
unicode_char(Py_UCS4 ch)
1818
267M
{
1819
267M
    PyObject *unicode;
1820
1821
267M
    assert(ch <= MAX_UNICODE);
1822
1823
267M
    if (ch < 256) {
1824
146M
        return get_latin1_char(ch);
1825
146M
    }
1826
1827
120M
    unicode = PyUnicode_New(1, ch);
1828
120M
    if (unicode == NULL)
1829
0
        return NULL;
1830
1831
120M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1832
120M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1833
112M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1834
112M
    } else {
1835
8.00M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1836
8.00M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1837
8.00M
    }
1838
120M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1839
120M
    return unicode;
1840
120M
}
1841
1842
1843
static inline void
1844
unicode_write_widechar(int kind, void *data,
1845
                       const wchar_t *u, Py_ssize_t size,
1846
                       Py_ssize_t num_surrogates)
1847
423k
{
1848
423k
    switch (kind) {
1849
395k
    case PyUnicode_1BYTE_KIND:
1850
395k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1851
395k
        break;
1852
1853
26.7k
    case PyUnicode_2BYTE_KIND:
1854
#if SIZEOF_WCHAR_T == 2
1855
        memcpy(data, u, size * 2);
1856
#else
1857
26.7k
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1858
26.7k
#endif
1859
26.7k
        break;
1860
1861
1.17k
    case PyUnicode_4BYTE_KIND:
1862
1.17k
    {
1863
#if SIZEOF_WCHAR_T == 2
1864
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1865
        // surrogate pairs.
1866
        const wchar_t *end = u + size;
1867
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1868
#  ifndef NDEBUG
1869
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1870
#  endif
1871
        for (const wchar_t *iter = u; iter < end; ) {
1872
            assert(ucs4_out < ucs4_end);
1873
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1874
                && (iter+1) < end
1875
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1876
            {
1877
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1878
                iter += 2;
1879
            }
1880
            else {
1881
                *ucs4_out++ = *iter;
1882
                iter++;
1883
            }
1884
        }
1885
        assert(ucs4_out == ucs4_end);
1886
#else
1887
1.17k
        assert(num_surrogates == 0);
1888
1.17k
        memcpy(data, u, size * 4);
1889
1.17k
#endif
1890
1.17k
        break;
1891
0
    }
1892
0
    default:
1893
0
        Py_UNREACHABLE();
1894
423k
    }
1895
423k
}
1896
1897
1898
PyObject *
1899
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1900
746k
{
1901
746k
    PyObject *unicode;
1902
746k
    Py_UCS4 maxchar = 0;
1903
746k
    Py_ssize_t num_surrogates;
1904
1905
746k
    if (u == NULL && size != 0) {
1906
0
        PyErr_BadInternalCall();
1907
0
        return NULL;
1908
0
    }
1909
1910
746k
    if (size == -1) {
1911
1.33k
        size = wcslen(u);
1912
1.33k
    }
1913
1914
    /* If the Unicode data is known at construction time, we can apply
1915
       some optimizations which share commonly used objects. */
1916
1917
    /* Optimization for empty strings */
1918
746k
    if (size == 0)
1919
260k
        _Py_RETURN_UNICODE_EMPTY();
1920
1921
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1922
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1923
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1924
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1925
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
1926
        if (!converted) {
1927
            return NULL;
1928
        }
1929
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
1930
        PyMem_Free(converted);
1931
        return unicode;
1932
    }
1933
#endif
1934
1935
    /* Single character Unicode objects in the Latin-1 range are
1936
       shared when using this constructor */
1937
485k
    if (size == 1 && (Py_UCS4)*u < 256)
1938
62.8k
        return get_latin1_char((unsigned char)*u);
1939
1940
    /* If not empty and not single character, copy the Unicode data
1941
       into the new object */
1942
423k
    if (find_maxchar_surrogates(u, u + size,
1943
423k
                                &maxchar, &num_surrogates) == -1)
1944
0
        return NULL;
1945
1946
423k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1947
423k
    if (!unicode)
1948
0
        return NULL;
1949
1950
423k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1951
423k
                           u, size, num_surrogates);
1952
1953
423k
    return unicode_result(unicode);
1954
423k
}
1955
1956
1957
int
1958
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
1959
                              const wchar_t *str,
1960
                              Py_ssize_t size)
1961
0
{
1962
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
1963
1964
0
    if (size < 0) {
1965
0
        size = wcslen(str);
1966
0
    }
1967
1968
0
    if (size == 0) {
1969
0
        return 0;
1970
0
    }
1971
1972
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
1973
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
1974
       non-Unicode locales and hence needs conversion to UCS-4 first. */
1975
    if (_Py_LocaleUsesNonUnicodeWchar()) {
1976
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
1977
        if (!converted) {
1978
            return -1;
1979
        }
1980
1981
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
1982
        PyMem_Free(converted);
1983
        return res;
1984
    }
1985
#endif
1986
1987
0
    Py_UCS4 maxchar = 0;
1988
0
    Py_ssize_t num_surrogates;
1989
0
    if (find_maxchar_surrogates(str, str + size,
1990
0
                                &maxchar, &num_surrogates) == -1) {
1991
0
        return -1;
1992
0
    }
1993
1994
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
1995
0
        return -1;
1996
0
    }
1997
1998
0
    int kind = writer->kind;
1999
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2000
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2001
2002
0
    writer->pos += size - num_surrogates;
2003
0
    return 0;
2004
0
}
2005
2006
2007
PyObject *
2008
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2009
7.50M
{
2010
7.50M
    if (size < 0) {
2011
0
        PyErr_SetString(PyExc_SystemError,
2012
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2013
0
        return NULL;
2014
0
    }
2015
7.50M
    if (u != NULL) {
2016
7.50M
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2017
7.50M
    }
2018
0
    if (size > 0) {
2019
0
        PyErr_SetString(PyExc_SystemError,
2020
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2021
0
        return NULL;
2022
0
    }
2023
0
    return _PyUnicode_GetEmpty();
2024
0
}
2025
2026
PyObject *
2027
PyUnicode_FromString(const char *u)
2028
21.3M
{
2029
21.3M
    size_t size = strlen(u);
2030
21.3M
    if (size > PY_SSIZE_T_MAX) {
2031
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2032
0
        return NULL;
2033
0
    }
2034
21.3M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2035
21.3M
}
2036
2037
2038
PyObject *
2039
_PyUnicode_FromId(_Py_Identifier *id)
2040
0
{
2041
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2042
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2043
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2044
2045
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2046
0
    if (index < 0) {
2047
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2048
2049
0
        PyMutex_Lock(&rt_ids->mutex);
2050
        // Check again to detect concurrent access. Another thread can have
2051
        // initialized the index while this thread waited for the lock.
2052
0
        index = _Py_atomic_load_ssize(&id->index);
2053
0
        if (index < 0) {
2054
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2055
0
            index = rt_ids->next_index;
2056
0
            rt_ids->next_index++;
2057
0
            _Py_atomic_store_ssize(&id->index, index);
2058
0
        }
2059
0
        PyMutex_Unlock(&rt_ids->mutex);
2060
0
    }
2061
0
    assert(index >= 0);
2062
2063
0
    PyObject *obj;
2064
0
    if (index < ids->size) {
2065
0
        obj = ids->array[index];
2066
0
        if (obj) {
2067
            // Return a borrowed reference
2068
0
            goto end;
2069
0
        }
2070
0
    }
2071
2072
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2073
0
                                       NULL, NULL);
2074
0
    if (!obj) {
2075
0
        goto end;
2076
0
    }
2077
0
    _PyUnicode_InternImmortal(interp, &obj);
2078
2079
0
    if (index >= ids->size) {
2080
        // Overallocate to reduce the number of realloc
2081
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2082
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2083
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2084
0
        if (new_array == NULL) {
2085
0
            PyErr_NoMemory();
2086
0
            obj = NULL;
2087
0
            goto end;
2088
0
        }
2089
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2090
0
        ids->array = new_array;
2091
0
        ids->size = new_size;
2092
0
    }
2093
2094
    // The array stores a strong reference
2095
0
    ids->array[index] = obj;
2096
2097
0
end:
2098
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2099
    // Return a borrowed reference
2100
0
    return obj;
2101
0
}
2102
2103
2104
static void
2105
unicode_clear_identifiers(struct _Py_unicode_state *state)
2106
0
{
2107
0
    struct _Py_unicode_ids *ids = &state->ids;
2108
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2109
0
        Py_XDECREF(ids->array[i]);
2110
0
    }
2111
0
    ids->size = 0;
2112
0
    PyMem_Free(ids->array);
2113
0
    ids->array = NULL;
2114
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2115
    // after Py_Finalize().
2116
0
}
2117
2118
2119
/* Internal function, doesn't check maximum character */
2120
2121
PyObject*
2122
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2123
119M
{
2124
119M
    const unsigned char *s = (const unsigned char *)buffer;
2125
119M
    PyObject *unicode;
2126
119M
    if (size == 1) {
2127
#ifdef Py_DEBUG
2128
        assert((unsigned char)s[0] < 128);
2129
#endif
2130
44.2M
        return get_latin1_char(s[0]);
2131
44.2M
    }
2132
75.0M
    unicode = PyUnicode_New(size, 127);
2133
75.0M
    if (!unicode)
2134
0
        return NULL;
2135
75.0M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2136
75.0M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2137
75.0M
    return unicode;
2138
75.0M
}
2139
2140
static Py_UCS4
2141
kind_maxchar_limit(int kind)
2142
0
{
2143
0
    switch (kind) {
2144
0
    case PyUnicode_1BYTE_KIND:
2145
0
        return 0x80;
2146
0
    case PyUnicode_2BYTE_KIND:
2147
0
        return 0x100;
2148
0
    case PyUnicode_4BYTE_KIND:
2149
0
        return 0x10000;
2150
0
    default:
2151
0
        Py_UNREACHABLE();
2152
0
    }
2153
0
}
2154
2155
static PyObject*
2156
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2157
47.5M
{
2158
47.5M
    PyObject *res;
2159
47.5M
    unsigned char max_char;
2160
2161
47.5M
    if (size == 0) {
2162
7.21M
        _Py_RETURN_UNICODE_EMPTY();
2163
7.21M
    }
2164
47.5M
    assert(size > 0);
2165
40.3M
    if (size == 1) {
2166
10.6M
        return get_latin1_char(u[0]);
2167
10.6M
    }
2168
2169
29.6M
    max_char = ucs1lib_find_max_char(u, u + size);
2170
29.6M
    res = PyUnicode_New(size, max_char);
2171
29.6M
    if (!res)
2172
0
        return NULL;
2173
29.6M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2174
29.6M
    assert(_PyUnicode_CheckConsistency(res, 1));
2175
29.6M
    return res;
2176
29.6M
}
2177
2178
static PyObject*
2179
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2180
105M
{
2181
105M
    PyObject *res;
2182
105M
    Py_UCS2 max_char;
2183
2184
105M
    if (size == 0)
2185
15.3M
        _Py_RETURN_UNICODE_EMPTY();
2186
105M
    assert(size > 0);
2187
89.9M
    if (size == 1)
2188
61.4M
        return unicode_char(u[0]);
2189
2190
28.4M
    max_char = ucs2lib_find_max_char(u, u + size);
2191
28.4M
    res = PyUnicode_New(size, max_char);
2192
28.4M
    if (!res)
2193
0
        return NULL;
2194
28.4M
    if (max_char >= 256)
2195
17.7M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2196
10.6M
    else {
2197
10.6M
        _PyUnicode_CONVERT_BYTES(
2198
10.6M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2199
10.6M
    }
2200
28.4M
    assert(_PyUnicode_CheckConsistency(res, 1));
2201
28.4M
    return res;
2202
28.4M
}
2203
2204
static PyObject*
2205
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2206
68.2M
{
2207
68.2M
    PyObject *res;
2208
68.2M
    Py_UCS4 max_char;
2209
2210
68.2M
    if (size == 0)
2211
9.22M
        _Py_RETURN_UNICODE_EMPTY();
2212
68.2M
    assert(size > 0);
2213
58.9M
    if (size == 1)
2214
38.6M
        return unicode_char(u[0]);
2215
2216
20.3M
    max_char = ucs4lib_find_max_char(u, u + size);
2217
20.3M
    res = PyUnicode_New(size, max_char);
2218
20.3M
    if (!res)
2219
0
        return NULL;
2220
20.3M
    if (max_char < 256)
2221
14.3M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2222
20.3M
                                 PyUnicode_1BYTE_DATA(res));
2223
5.95M
    else if (max_char < 0x10000)
2224
4.36M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2225
5.95M
                                 PyUnicode_2BYTE_DATA(res));
2226
1.58M
    else
2227
1.58M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2228
20.3M
    assert(_PyUnicode_CheckConsistency(res, 1));
2229
20.3M
    return res;
2230
20.3M
}
2231
2232
2233
int
2234
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2235
                          const Py_UCS4 *str,
2236
                          Py_ssize_t size)
2237
0
{
2238
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2239
2240
0
    if (size < 0) {
2241
0
        PyErr_SetString(PyExc_ValueError,
2242
0
                        "size must be positive");
2243
0
        return -1;
2244
0
    }
2245
2246
0
    if (size == 0) {
2247
0
        return 0;
2248
0
    }
2249
2250
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2251
2252
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2253
0
        return -1;
2254
0
    }
2255
2256
0
    int kind = writer->kind;
2257
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2258
0
    if (kind == PyUnicode_1BYTE_KIND) {
2259
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2260
0
                                 str, str + size,
2261
0
                                 data);
2262
0
    }
2263
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2264
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2265
0
                                 str, str + size,
2266
0
                                 data);
2267
0
    }
2268
0
    else {
2269
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2270
0
    }
2271
0
    writer->pos += size;
2272
2273
0
    return 0;
2274
0
}
2275
2276
2277
PyObject*
2278
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2279
167M
{
2280
167M
    if (size < 0) {
2281
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2282
0
        return NULL;
2283
0
    }
2284
167M
    switch (kind) {
2285
22.3M
    case PyUnicode_1BYTE_KIND:
2286
22.3M
        return _PyUnicode_FromUCS1(buffer, size);
2287
88.6M
    case PyUnicode_2BYTE_KIND:
2288
88.6M
        return _PyUnicode_FromUCS2(buffer, size);
2289
56.7M
    case PyUnicode_4BYTE_KIND:
2290
56.7M
        return _PyUnicode_FromUCS4(buffer, size);
2291
0
    default:
2292
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2293
0
        return NULL;
2294
167M
    }
2295
167M
}
2296
2297
Py_UCS4
2298
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2299
11.7M
{
2300
11.7M
    int kind;
2301
11.7M
    const void *startptr, *endptr;
2302
2303
11.7M
    assert(0 <= start);
2304
11.7M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2305
11.7M
    assert(start <= end);
2306
2307
11.7M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2308
74.5k
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2309
2310
11.7M
    if (start == end)
2311
0
        return 127;
2312
2313
11.7M
    if (PyUnicode_IS_ASCII(unicode))
2314
11.6M
        return 127;
2315
2316
23.4k
    kind = PyUnicode_KIND(unicode);
2317
23.4k
    startptr = PyUnicode_DATA(unicode);
2318
23.4k
    endptr = (char *)startptr + end * kind;
2319
23.4k
    startptr = (char *)startptr + start * kind;
2320
23.4k
    switch(kind) {
2321
1.63k
    case PyUnicode_1BYTE_KIND:
2322
1.63k
        return ucs1lib_find_max_char(startptr, endptr);
2323
5.61k
    case PyUnicode_2BYTE_KIND:
2324
5.61k
        return ucs2lib_find_max_char(startptr, endptr);
2325
16.2k
    case PyUnicode_4BYTE_KIND:
2326
16.2k
        return ucs4lib_find_max_char(startptr, endptr);
2327
0
    default:
2328
0
        Py_UNREACHABLE();
2329
23.4k
    }
2330
23.4k
}
2331
2332
/* Ensure that a string uses the most efficient storage, if it is not the
2333
   case: create a new string with of the right kind. Write NULL into *p_unicode
2334
   on error. */
2335
static void
2336
unicode_adjust_maxchar(PyObject **p_unicode)
2337
0
{
2338
0
    PyObject *unicode, *copy;
2339
0
    Py_UCS4 max_char;
2340
0
    Py_ssize_t len;
2341
0
    int kind;
2342
2343
0
    assert(p_unicode != NULL);
2344
0
    unicode = *p_unicode;
2345
0
    if (PyUnicode_IS_ASCII(unicode))
2346
0
        return;
2347
2348
0
    len = PyUnicode_GET_LENGTH(unicode);
2349
0
    kind = PyUnicode_KIND(unicode);
2350
0
    if (kind == PyUnicode_1BYTE_KIND) {
2351
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2352
0
        max_char = ucs1lib_find_max_char(u, u + len);
2353
0
        if (max_char >= 128)
2354
0
            return;
2355
0
    }
2356
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2357
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2358
0
        max_char = ucs2lib_find_max_char(u, u + len);
2359
0
        if (max_char >= 256)
2360
0
            return;
2361
0
    }
2362
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2363
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2364
0
        max_char = ucs4lib_find_max_char(u, u + len);
2365
0
        if (max_char >= 0x10000)
2366
0
            return;
2367
0
    }
2368
0
    else
2369
0
        Py_UNREACHABLE();
2370
2371
0
    copy = PyUnicode_New(len, max_char);
2372
0
    if (copy != NULL)
2373
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2374
0
    Py_DECREF(unicode);
2375
0
    *p_unicode = copy;
2376
0
}
2377
2378
PyObject*
2379
_PyUnicode_Copy(PyObject *unicode)
2380
3.24M
{
2381
3.24M
    Py_ssize_t length;
2382
3.24M
    PyObject *copy;
2383
2384
3.24M
    if (!PyUnicode_Check(unicode)) {
2385
0
        PyErr_BadInternalCall();
2386
0
        return NULL;
2387
0
    }
2388
2389
3.24M
    length = PyUnicode_GET_LENGTH(unicode);
2390
3.24M
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2391
3.24M
    if (!copy)
2392
0
        return NULL;
2393
3.24M
    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2394
2395
3.24M
    memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2396
3.24M
              length * PyUnicode_KIND(unicode));
2397
3.24M
    assert(_PyUnicode_CheckConsistency(copy, 1));
2398
3.24M
    return copy;
2399
3.24M
}
2400
2401
2402
/* Widen Unicode objects to larger buffers. Don't write terminating null
2403
   character. Return NULL on error. */
2404
2405
static void*
2406
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2407
7.49M
{
2408
7.49M
    void *result;
2409
2410
7.49M
    assert(skind < kind);
2411
7.49M
    switch (kind) {
2412
4.85M
    case PyUnicode_2BYTE_KIND:
2413
4.85M
        result = PyMem_New(Py_UCS2, len);
2414
4.85M
        if (!result)
2415
0
            return PyErr_NoMemory();
2416
4.85M
        assert(skind == PyUnicode_1BYTE_KIND);
2417
4.85M
        _PyUnicode_CONVERT_BYTES(
2418
4.85M
            Py_UCS1, Py_UCS2,
2419
4.85M
            (const Py_UCS1 *)data,
2420
4.85M
            ((const Py_UCS1 *)data) + len,
2421
4.85M
            result);
2422
4.85M
        return result;
2423
2.64M
    case PyUnicode_4BYTE_KIND:
2424
2.64M
        result = PyMem_New(Py_UCS4, len);
2425
2.64M
        if (!result)
2426
0
            return PyErr_NoMemory();
2427
2.64M
        if (skind == PyUnicode_2BYTE_KIND) {
2428
0
            _PyUnicode_CONVERT_BYTES(
2429
0
                Py_UCS2, Py_UCS4,
2430
0
                (const Py_UCS2 *)data,
2431
0
                ((const Py_UCS2 *)data) + len,
2432
0
                result);
2433
0
        }
2434
2.64M
        else {
2435
2.64M
            assert(skind == PyUnicode_1BYTE_KIND);
2436
2.64M
            _PyUnicode_CONVERT_BYTES(
2437
2.64M
                Py_UCS1, Py_UCS4,
2438
2.64M
                (const Py_UCS1 *)data,
2439
2.64M
                ((const Py_UCS1 *)data) + len,
2440
2.64M
                result);
2441
2.64M
        }
2442
2.64M
        return result;
2443
0
    default:
2444
0
        Py_UNREACHABLE();
2445
0
        return NULL;
2446
7.49M
    }
2447
7.49M
}
2448
2449
static Py_UCS4*
2450
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2451
        int copy_null)
2452
77.3k
{
2453
77.3k
    int kind;
2454
77.3k
    const void *data;
2455
77.3k
    Py_ssize_t len, targetlen;
2456
77.3k
    kind = PyUnicode_KIND(string);
2457
77.3k
    data = PyUnicode_DATA(string);
2458
77.3k
    len = PyUnicode_GET_LENGTH(string);
2459
77.3k
    targetlen = len;
2460
77.3k
    if (copy_null)
2461
0
        targetlen++;
2462
77.3k
    if (!target) {
2463
0
        target = PyMem_New(Py_UCS4, targetlen);
2464
0
        if (!target) {
2465
0
            PyErr_NoMemory();
2466
0
            return NULL;
2467
0
        }
2468
0
    }
2469
77.3k
    else {
2470
77.3k
        if (targetsize < targetlen) {
2471
0
            PyErr_Format(PyExc_SystemError,
2472
0
                         "string is longer than the buffer");
2473
0
            if (copy_null && 0 < targetsize)
2474
0
                target[0] = 0;
2475
0
            return NULL;
2476
0
        }
2477
77.3k
    }
2478
77.3k
    if (kind == PyUnicode_1BYTE_KIND) {
2479
53.8k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2480
53.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2481
53.8k
    }
2482
23.5k
    else if (kind == PyUnicode_2BYTE_KIND) {
2483
17.1k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2484
17.1k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2485
17.1k
    }
2486
6.46k
    else if (kind == PyUnicode_4BYTE_KIND) {
2487
6.46k
        memcpy(target, data, len * sizeof(Py_UCS4));
2488
6.46k
    }
2489
0
    else {
2490
0
        Py_UNREACHABLE();
2491
0
    }
2492
77.3k
    if (copy_null)
2493
0
        target[len] = 0;
2494
77.3k
    return target;
2495
77.3k
}
2496
2497
Py_UCS4*
2498
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2499
                 int copy_null)
2500
77.3k
{
2501
77.3k
    if (target == NULL || targetsize < 0) {
2502
0
        PyErr_BadInternalCall();
2503
0
        return NULL;
2504
0
    }
2505
77.3k
    return as_ucs4(string, target, targetsize, copy_null);
2506
77.3k
}
2507
2508
Py_UCS4*
2509
PyUnicode_AsUCS4Copy(PyObject *string)
2510
0
{
2511
0
    return as_ucs4(string, NULL, 0, 1);
2512
0
}
2513
2514
/* maximum number of characters required for output of %jo or %jd or %p.
2515
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
2516
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
2517
   plus 1 for the terminal NUL. */
2518
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2519
2520
static int
2521
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2522
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2523
7.14M
{
2524
7.14M
    Py_ssize_t length, fill, arglen;
2525
7.14M
    Py_UCS4 maxchar;
2526
2527
7.14M
    length = PyUnicode_GET_LENGTH(str);
2528
7.14M
    if ((precision == -1 || precision >= length)
2529
7.14M
        && width <= length)
2530
7.14M
        return _PyUnicodeWriter_WriteStr(writer, str);
2531
2532
47
    if (precision != -1)
2533
47
        length = Py_MIN(precision, length);
2534
2535
47
    arglen = Py_MAX(length, width);
2536
47
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2537
19
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2538
28
    else
2539
28
        maxchar = writer->maxchar;
2540
2541
47
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2542
0
        return -1;
2543
2544
47
    fill = Py_MAX(width - length, 0);
2545
47
    if (fill && !(flags & F_LJUST)) {
2546
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2547
0
            return -1;
2548
0
        writer->pos += fill;
2549
0
    }
2550
2551
47
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2552
47
                                  str, 0, length);
2553
47
    writer->pos += length;
2554
2555
47
    if (fill && (flags & F_LJUST)) {
2556
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2557
0
            return -1;
2558
0
        writer->pos += fill;
2559
0
    }
2560
2561
47
    return 0;
2562
47
}
2563
2564
static int
2565
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2566
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2567
3.14M
{
2568
    /* UTF-8 */
2569
3.14M
    Py_ssize_t *pconsumed = NULL;
2570
3.14M
    Py_ssize_t length;
2571
3.14M
    if (precision == -1) {
2572
783k
        length = strlen(str);
2573
783k
    }
2574
2.36M
    else {
2575
2.36M
        length = 0;
2576
37.5M
        while (length < precision && str[length]) {
2577
35.2M
            length++;
2578
35.2M
        }
2579
2.36M
        if (length == precision) {
2580
            /* The input string is not NUL-terminated.  If it ends with an
2581
             * incomplete UTF-8 sequence, truncate the string just before it.
2582
             * Incomplete sequences in the middle and sequences which cannot
2583
             * be valid prefixes are still treated as errors and replaced
2584
             * with \xfffd. */
2585
1.85k
            pconsumed = &length;
2586
1.85k
        }
2587
2.36M
    }
2588
2589
3.14M
    if (width < 0) {
2590
3.14M
        return _PyUnicode_DecodeUTF8Writer(writer, str, length,
2591
3.14M
                                           _Py_ERROR_REPLACE, "replace", pconsumed);
2592
3.14M
    }
2593
2594
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2595
0
                                                     "replace", pconsumed);
2596
0
    if (unicode == NULL)
2597
0
        return -1;
2598
2599
0
    int res = unicode_fromformat_write_str(writer, unicode,
2600
0
                                           width, -1, flags);
2601
0
    Py_DECREF(unicode);
2602
0
    return res;
2603
0
}
2604
2605
static int
2606
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2607
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2608
0
{
2609
0
    Py_ssize_t length;
2610
0
    if (precision == -1) {
2611
0
        length = wcslen(str);
2612
0
    }
2613
0
    else {
2614
0
        length = 0;
2615
0
        while (length < precision && str[length]) {
2616
0
            length++;
2617
0
        }
2618
0
    }
2619
2620
0
    if (width < 0) {
2621
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2622
0
                                             str, length);
2623
0
    }
2624
2625
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2626
0
    if (unicode == NULL)
2627
0
        return -1;
2628
2629
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2630
0
    Py_DECREF(unicode);
2631
0
    return res;
2632
0
}
2633
2634
0
#define F_LONG 1
2635
0
#define F_LONGLONG 2
2636
232k
#define F_SIZE 3
2637
0
#define F_PTRDIFF 4
2638
0
#define F_INTMAX 5
2639
2640
static const char*
2641
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2642
                       const char *f, va_list *vargs)
2643
24.4M
{
2644
24.4M
    const char *p;
2645
24.4M
    Py_ssize_t len;
2646
24.4M
    int flags = 0;
2647
24.4M
    Py_ssize_t width;
2648
24.4M
    Py_ssize_t precision;
2649
2650
24.4M
    p = f;
2651
24.4M
    f++;
2652
24.4M
    if (*f == '%') {
2653
945k
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2654
0
            return NULL;
2655
945k
        f++;
2656
945k
        return f;
2657
945k
    }
2658
2659
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2660
    /* Flags '+', ' ' and '#' are not particularly useful.
2661
     * They are not worth the implementation and maintenance costs.
2662
     * In addition, '#' should add "0" for "o" conversions for compatibility
2663
     * with printf, but it would confuse Python users. */
2664
23.5M
    while (1) {
2665
23.5M
        switch (*f++) {
2666
0
        case '-': flags |= F_LJUST; continue;
2667
1.66k
        case '0': flags |= F_ZERO; continue;
2668
0
        case '#': flags |= F_ALT; continue;
2669
23.5M
        }
2670
23.5M
        f--;
2671
23.5M
        break;
2672
23.5M
    }
2673
2674
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2675
23.5M
    width = -1;
2676
23.5M
    if (*f == '*') {
2677
0
        width = va_arg(*vargs, int);
2678
0
        if (width < 0) {
2679
0
            flags |= F_LJUST;
2680
0
            width = -width;
2681
0
        }
2682
0
        f++;
2683
0
    }
2684
23.5M
    else if (Py_ISDIGIT((unsigned)*f)) {
2685
1.66k
        width = *f - '0';
2686
1.66k
        f++;
2687
1.66k
        while (Py_ISDIGIT((unsigned)*f)) {
2688
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2689
0
                PyErr_SetString(PyExc_ValueError,
2690
0
                                "width too big");
2691
0
                return NULL;
2692
0
            }
2693
0
            width = (width * 10) + (*f - '0');
2694
0
            f++;
2695
0
        }
2696
1.66k
    }
2697
23.5M
    precision = -1;
2698
23.5M
    if (*f == '.') {
2699
4.73M
        f++;
2700
4.73M
        if (*f == '*') {
2701
0
            precision = va_arg(*vargs, int);
2702
0
            if (precision < 0) {
2703
0
                precision = -2;
2704
0
            }
2705
0
            f++;
2706
0
        }
2707
4.73M
        else if (Py_ISDIGIT((unsigned)*f)) {
2708
4.73M
            precision = (*f - '0');
2709
4.73M
            f++;
2710
14.1M
            while (Py_ISDIGIT((unsigned)*f)) {
2711
9.46M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2712
0
                    PyErr_SetString(PyExc_ValueError,
2713
0
                                    "precision too big");
2714
0
                    return NULL;
2715
0
                }
2716
9.46M
                precision = (precision * 10) + (*f - '0');
2717
9.46M
                f++;
2718
9.46M
            }
2719
4.73M
        }
2720
4.73M
    }
2721
2722
23.5M
    int sizemod = 0;
2723
23.5M
    if (*f == 'l') {
2724
0
        if (f[1] == 'l') {
2725
0
            sizemod = F_LONGLONG;
2726
0
            f += 2;
2727
0
        }
2728
0
        else {
2729
0
            sizemod = F_LONG;
2730
0
            ++f;
2731
0
        }
2732
0
    }
2733
23.5M
    else if (*f == 'z') {
2734
116k
        sizemod = F_SIZE;
2735
116k
        ++f;
2736
116k
    }
2737
23.4M
    else if (*f == 't') {
2738
0
        sizemod = F_PTRDIFF;
2739
0
        ++f;
2740
0
    }
2741
23.4M
    else if (*f == 'j') {
2742
0
        sizemod = F_INTMAX;
2743
0
        ++f;
2744
0
    }
2745
23.5M
    if (f[0] != '\0' && f[1] == '\0')
2746
3.92M
        writer->overallocate = 0;
2747
2748
23.5M
    switch (*f) {
2749
11.7M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2750
11.7M
        break;
2751
1.44M
    case 'c': case 'p':
2752
1.44M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2753
1.44M
        break;
2754
3.14M
    case 's':
2755
3.14M
    case 'V':
2756
3.14M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2757
3.14M
        break;
2758
7.14M
    default:
2759
7.14M
        if (sizemod) goto invalid_format;
2760
7.14M
        break;
2761
23.5M
    }
2762
2763
23.5M
    switch (*f) {
2764
1.44M
    case 'c':
2765
1.44M
    {
2766
1.44M
        int ordinal = va_arg(*vargs, int);
2767
1.44M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2768
0
            PyErr_SetString(PyExc_OverflowError,
2769
0
                            "character argument not in range(0x110000)");
2770
0
            return NULL;
2771
0
        }
2772
1.44M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2773
0
            return NULL;
2774
1.44M
        break;
2775
1.44M
    }
2776
2777
11.7M
    case 'd': case 'i':
2778
11.7M
    case 'o': case 'u': case 'x': case 'X':
2779
11.7M
    {
2780
11.7M
        char buffer[MAX_INTMAX_CHARS];
2781
2782
        // Fill buffer using sprinf, with one of many possible format
2783
        // strings, like "%llX" for `long long` in hexadecimal.
2784
        // The type/size is in `sizemod`; the format is in `*f`.
2785
2786
        // Use macros with nested switches to keep the sprintf format strings
2787
        // as compile-time literals, avoiding warnings and maybe allowing
2788
        // optimizations.
2789
2790
        // `SPRINT` macro does one sprintf
2791
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2792
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2793
11.7M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2794
11.7M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2795
2796
        // One inner switch to handle all format variants
2797
11.7M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2798
11.7M
            switch (*f) {                                                     \
2799
96
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2800
18.8k
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2801
1.28k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2802
936
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2803
11.7M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2804
11.7M
            }
2805
2806
        // Outer switch to handle all the sizes/types
2807
11.7M
        switch (sizemod) {
2808
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2809
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2810
116k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2811
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2812
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2813
11.6M
            default:         DO_SPRINTS("", int, unsigned int); break;
2814
11.7M
        }
2815
11.7M
        #undef SPRINT
2816
11.7M
        #undef DO_SPRINTS
2817
2818
11.7M
        assert(len >= 0);
2819
2820
11.7M
        int sign = (buffer[0] == '-');
2821
11.7M
        len -= sign;
2822
2823
11.7M
        precision = Py_MAX(precision, len);
2824
11.7M
        width = Py_MAX(width, precision + sign);
2825
11.7M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2826
1.66k
            precision = width - sign;
2827
1.66k
        }
2828
2829
11.7M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2830
11.7M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2831
2832
11.7M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2833
0
            return NULL;
2834
2835
11.7M
        if (spacepad && !(flags & F_LJUST)) {
2836
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2837
0
                return NULL;
2838
0
            writer->pos += spacepad;
2839
0
        }
2840
2841
11.7M
        if (sign) {
2842
811
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2843
0
                return NULL;
2844
811
        }
2845
2846
11.7M
        if (zeropad) {
2847
648
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2848
0
                return NULL;
2849
648
            writer->pos += zeropad;
2850
648
        }
2851
2852
11.7M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2853
0
            return NULL;
2854
2855
11.7M
        if (spacepad && (flags & F_LJUST)) {
2856
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2857
0
                return NULL;
2858
0
            writer->pos += spacepad;
2859
0
        }
2860
11.7M
        break;
2861
11.7M
    }
2862
2863
11.7M
    case 'p':
2864
2.92k
    {
2865
2.92k
        char number[MAX_INTMAX_CHARS];
2866
2867
2.92k
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2868
2.92k
        assert(len >= 0);
2869
2870
        /* %p is ill-defined:  ensure leading 0x. */
2871
2.92k
        if (number[1] == 'X')
2872
0
            number[1] = 'x';
2873
2.92k
        else if (number[1] != 'x') {
2874
0
            memmove(number + 2, number,
2875
0
                    strlen(number) + 1);
2876
0
            number[0] = '0';
2877
0
            number[1] = 'x';
2878
0
            len += 2;
2879
0
        }
2880
2881
2.92k
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2882
0
            return NULL;
2883
2.92k
        break;
2884
2.92k
    }
2885
2886
3.14M
    case 's':
2887
3.14M
    {
2888
3.14M
        if (sizemod) {
2889
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2890
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2891
0
                return NULL;
2892
0
        }
2893
3.14M
        else {
2894
            /* UTF-8 */
2895
3.14M
            const char *s = va_arg(*vargs, const char*);
2896
3.14M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2897
0
                return NULL;
2898
3.14M
        }
2899
3.14M
        break;
2900
3.14M
    }
2901
2902
3.33M
    case 'U':
2903
3.33M
    {
2904
3.33M
        PyObject *obj = va_arg(*vargs, PyObject *);
2905
3.33M
        assert(obj && _PyUnicode_CHECK(obj));
2906
2907
3.33M
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2908
0
            return NULL;
2909
3.33M
        break;
2910
3.33M
    }
2911
2912
3.33M
    case 'V':
2913
565
    {
2914
565
        PyObject *obj = va_arg(*vargs, PyObject *);
2915
565
        const char *str;
2916
565
        const wchar_t *wstr;
2917
565
        if (sizemod) {
2918
0
            wstr = va_arg(*vargs, const wchar_t*);
2919
0
        }
2920
565
        else {
2921
565
            str = va_arg(*vargs, const char *);
2922
565
        }
2923
565
        if (obj) {
2924
0
            assert(_PyUnicode_CHECK(obj));
2925
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
2926
0
                return NULL;
2927
0
        }
2928
565
        else if (sizemod) {
2929
0
            assert(wstr != NULL);
2930
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
2931
0
                return NULL;
2932
0
        }
2933
565
        else {
2934
565
            assert(str != NULL);
2935
565
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
2936
0
                return NULL;
2937
565
        }
2938
565
        break;
2939
565
    }
2940
2941
1.53k
    case 'S':
2942
1.53k
    {
2943
1.53k
        PyObject *obj = va_arg(*vargs, PyObject *);
2944
1.53k
        PyObject *str;
2945
1.53k
        assert(obj);
2946
1.53k
        str = PyObject_Str(obj);
2947
1.53k
        if (!str)
2948
0
            return NULL;
2949
1.53k
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
2950
0
            Py_DECREF(str);
2951
0
            return NULL;
2952
0
        }
2953
1.53k
        Py_DECREF(str);
2954
1.53k
        break;
2955
1.53k
    }
2956
2957
2.37M
    case 'R':
2958
2.37M
    {
2959
2.37M
        PyObject *obj = va_arg(*vargs, PyObject *);
2960
2.37M
        PyObject *repr;
2961
2.37M
        assert(obj);
2962
2.37M
        repr = PyObject_Repr(obj);
2963
2.37M
        if (!repr)
2964
0
            return NULL;
2965
2.37M
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
2966
0
            Py_DECREF(repr);
2967
0
            return NULL;
2968
0
        }
2969
2.37M
        Py_DECREF(repr);
2970
2.37M
        break;
2971
2.37M
    }
2972
2973
0
    case 'A':
2974
0
    {
2975
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2976
0
        PyObject *ascii;
2977
0
        assert(obj);
2978
0
        ascii = PyObject_ASCII(obj);
2979
0
        if (!ascii)
2980
0
            return NULL;
2981
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
2982
0
            Py_DECREF(ascii);
2983
0
            return NULL;
2984
0
        }
2985
0
        Py_DECREF(ascii);
2986
0
        break;
2987
0
    }
2988
2989
1.43M
    case 'T':
2990
1.43M
    {
2991
1.43M
        PyObject *obj = va_arg(*vargs, PyObject *);
2992
1.43M
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
2993
2994
1.43M
        PyObject *type_name;
2995
1.43M
        if (flags & F_ALT) {
2996
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
2997
0
        }
2998
1.43M
        else {
2999
1.43M
            type_name = PyType_GetFullyQualifiedName(type);
3000
1.43M
        }
3001
1.43M
        Py_DECREF(type);
3002
1.43M
        if (!type_name) {
3003
0
            return NULL;
3004
0
        }
3005
3006
1.43M
        if (unicode_fromformat_write_str(writer, type_name,
3007
1.43M
                                         width, precision, flags) == -1) {
3008
0
            Py_DECREF(type_name);
3009
0
            return NULL;
3010
0
        }
3011
1.43M
        Py_DECREF(type_name);
3012
1.43M
        break;
3013
1.43M
    }
3014
3015
0
    case 'N':
3016
0
    {
3017
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3018
0
        assert(type_raw != NULL);
3019
3020
0
        if (!PyType_Check(type_raw)) {
3021
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3022
0
            return NULL;
3023
0
        }
3024
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3025
3026
0
        PyObject *type_name;
3027
0
        if (flags & F_ALT) {
3028
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3029
0
        }
3030
0
        else {
3031
0
            type_name = PyType_GetFullyQualifiedName(type);
3032
0
        }
3033
0
        if (!type_name) {
3034
0
            return NULL;
3035
0
        }
3036
0
        if (unicode_fromformat_write_str(writer, type_name,
3037
0
                                         width, precision, flags) == -1) {
3038
0
            Py_DECREF(type_name);
3039
0
            return NULL;
3040
0
        }
3041
0
        Py_DECREF(type_name);
3042
0
        break;
3043
0
    }
3044
3045
0
    default:
3046
0
    invalid_format:
3047
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3048
0
        return NULL;
3049
23.5M
    }
3050
3051
23.5M
    f++;
3052
23.5M
    return f;
3053
23.5M
}
3054
3055
static int
3056
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3057
11.6M
{
3058
11.6M
    Py_ssize_t len = strlen(format);
3059
11.6M
    writer->min_length += len + 100;
3060
11.6M
    writer->overallocate = 1;
3061
3062
    // Copy varags to be able to pass a reference to a subfunction.
3063
11.6M
    va_list vargs2;
3064
11.6M
    va_copy(vargs2, vargs);
3065
3066
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3067
    // to be encoded to ASCII.
3068
11.6M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3069
11.6M
    if (!is_ascii) {
3070
0
        Py_ssize_t i;
3071
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3072
0
        PyErr_Format(PyExc_ValueError,
3073
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3074
0
            "string, got a non-ASCII byte: 0x%02x",
3075
0
            (unsigned char)format[i]);
3076
0
        goto fail;
3077
0
    }
3078
3079
65.7M
    for (const char *f = format; *f; ) {
3080
54.1M
        if (*f == '%') {
3081
24.4M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3082
24.4M
            if (f == NULL)
3083
0
                goto fail;
3084
24.4M
        }
3085
29.6M
        else {
3086
29.6M
            const char *p = strchr(f, '%');
3087
29.6M
            if (p != NULL) {
3088
21.9M
                len = p - f;
3089
21.9M
            }
3090
7.72M
            else {
3091
7.72M
                len = strlen(f);
3092
7.72M
                writer->overallocate = 0;
3093
7.72M
            }
3094
3095
29.6M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3096
0
                goto fail;
3097
0
            }
3098
29.6M
            f += len;
3099
29.6M
        }
3100
54.1M
    }
3101
11.6M
    va_end(vargs2);
3102
11.6M
    return 0;
3103
3104
0
  fail:
3105
0
    va_end(vargs2);
3106
0
    return -1;
3107
11.6M
}
3108
3109
PyObject *
3110
PyUnicode_FromFormatV(const char *format, va_list vargs)
3111
11.6M
{
3112
11.6M
    _PyUnicodeWriter writer;
3113
11.6M
    _PyUnicodeWriter_Init(&writer);
3114
3115
11.6M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3116
0
        _PyUnicodeWriter_Dealloc(&writer);
3117
0
        return NULL;
3118
0
    }
3119
11.6M
    return _PyUnicodeWriter_Finish(&writer);
3120
11.6M
}
3121
3122
PyObject *
3123
PyUnicode_FromFormat(const char *format, ...)
3124
541k
{
3125
541k
    PyObject* ret;
3126
541k
    va_list vargs;
3127
3128
541k
    va_start(vargs, format);
3129
541k
    ret = PyUnicode_FromFormatV(format, vargs);
3130
541k
    va_end(vargs);
3131
541k
    return ret;
3132
541k
}
3133
3134
int
3135
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3136
0
{
3137
0
    va_list vargs;
3138
0
    va_start(vargs, format);
3139
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3140
0
    va_end(vargs);
3141
0
    return res;
3142
0
}
3143
3144
int
3145
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3146
                         va_list vargs)
3147
0
{
3148
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3149
0
    Py_ssize_t old_pos = _writer->pos;
3150
3151
0
    int res = unicode_from_format(_writer, format, vargs);
3152
3153
0
    if (res < 0) {
3154
0
        _writer->pos = old_pos;
3155
0
    }
3156
0
    return res;
3157
0
}
3158
3159
static Py_ssize_t
3160
unicode_get_widechar_size(PyObject *unicode)
3161
176k
{
3162
176k
    Py_ssize_t res;
3163
3164
176k
    assert(unicode != NULL);
3165
176k
    assert(_PyUnicode_CHECK(unicode));
3166
3167
176k
    res = _PyUnicode_LENGTH(unicode);
3168
#if SIZEOF_WCHAR_T == 2
3169
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3170
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3171
        const Py_UCS4 *end = s + res;
3172
        for (; s < end; ++s) {
3173
            if (*s > 0xFFFF) {
3174
                ++res;
3175
            }
3176
        }
3177
    }
3178
#endif
3179
176k
    return res;
3180
176k
}
3181
3182
static void
3183
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3184
176k
{
3185
176k
    assert(unicode != NULL);
3186
176k
    assert(_PyUnicode_CHECK(unicode));
3187
3188
176k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3189
1.17k
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3190
1.17k
        return;
3191
1.17k
    }
3192
3193
175k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3194
148k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3195
4.27M
        for (; size--; ++s, ++w) {
3196
4.12M
            *w = *s;
3197
4.12M
        }
3198
148k
    }
3199
26.7k
    else {
3200
26.7k
#if SIZEOF_WCHAR_T == 4
3201
26.7k
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3202
26.7k
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3203
1.67M
        for (; size--; ++s, ++w) {
3204
1.64M
            *w = *s;
3205
1.64M
        }
3206
#else
3207
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3208
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3209
        for (; size--; ++s, ++w) {
3210
            Py_UCS4 ch = *s;
3211
            if (ch > 0xFFFF) {
3212
                assert(ch <= MAX_UNICODE);
3213
                /* encode surrogate pair in this case */
3214
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3215
                if (!size--)
3216
                    break;
3217
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3218
            }
3219
            else {
3220
                *w = ch;
3221
            }
3222
        }
3223
#endif
3224
26.7k
    }
3225
175k
}
3226
3227
#ifdef HAVE_WCHAR_H
3228
3229
/* Convert a Unicode object to a wide character string.
3230
3231
   - If w is NULL: return the number of wide characters (including the null
3232
     character) required to convert the unicode object. Ignore size argument.
3233
3234
   - Otherwise: return the number of wide characters (excluding the null
3235
     character) written into w. Write at most size wide characters (including
3236
     the null character). */
3237
Py_ssize_t
3238
PyUnicode_AsWideChar(PyObject *unicode,
3239
                     wchar_t *w,
3240
                     Py_ssize_t size)
3241
1.41k
{
3242
1.41k
    Py_ssize_t res;
3243
3244
1.41k
    if (unicode == NULL) {
3245
0
        PyErr_BadInternalCall();
3246
0
        return -1;
3247
0
    }
3248
1.41k
    if (!PyUnicode_Check(unicode)) {
3249
0
        PyErr_BadArgument();
3250
0
        return -1;
3251
0
    }
3252
3253
1.41k
    res = unicode_get_widechar_size(unicode);
3254
1.41k
    if (w == NULL) {
3255
0
        return res + 1;
3256
0
    }
3257
3258
1.41k
    if (size > res) {
3259
1.41k
        size = res + 1;
3260
1.41k
    }
3261
0
    else {
3262
0
        res = size;
3263
0
    }
3264
1.41k
    unicode_copy_as_widechar(unicode, w, size);
3265
3266
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3267
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3268
       non-Unicode locales and hence needs conversion first. */
3269
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3270
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3271
            return -1;
3272
        }
3273
    }
3274
#endif
3275
3276
1.41k
    return res;
3277
1.41k
}
3278
3279
wchar_t*
3280
PyUnicode_AsWideCharString(PyObject *unicode,
3281
                           Py_ssize_t *size)
3282
175k
{
3283
175k
    wchar_t *buffer;
3284
175k
    Py_ssize_t buflen;
3285
3286
175k
    if (unicode == NULL) {
3287
0
        PyErr_BadInternalCall();
3288
0
        return NULL;
3289
0
    }
3290
175k
    if (!PyUnicode_Check(unicode)) {
3291
0
        PyErr_BadArgument();
3292
0
        return NULL;
3293
0
    }
3294
3295
175k
    buflen = unicode_get_widechar_size(unicode);
3296
175k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3297
175k
    if (buffer == NULL) {
3298
0
        PyErr_NoMemory();
3299
0
        return NULL;
3300
0
    }
3301
175k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3302
3303
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3304
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3305
       non-Unicode locales and hence needs conversion first. */
3306
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3307
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3308
            return NULL;
3309
        }
3310
    }
3311
#endif
3312
3313
175k
    if (size != NULL) {
3314
173k
        *size = buflen;
3315
173k
    }
3316
1.14k
    else if (wcslen(buffer) != (size_t)buflen) {
3317
0
        PyMem_Free(buffer);
3318
0
        PyErr_SetString(PyExc_ValueError,
3319
0
                        "embedded null character");
3320
0
        return NULL;
3321
0
    }
3322
175k
    return buffer;
3323
175k
}
3324
3325
#endif /* HAVE_WCHAR_H */
3326
3327
int
3328
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3329
0
{
3330
0
    wchar_t **p = (wchar_t **)ptr;
3331
0
    if (obj == NULL) {
3332
0
        PyMem_Free(*p);
3333
0
        *p = NULL;
3334
0
        return 1;
3335
0
    }
3336
0
    if (PyUnicode_Check(obj)) {
3337
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3338
0
        if (*p == NULL) {
3339
0
            return 0;
3340
0
        }
3341
0
        return Py_CLEANUP_SUPPORTED;
3342
0
    }
3343
0
    PyErr_Format(PyExc_TypeError,
3344
0
                 "argument must be str, not %.50s",
3345
0
                 Py_TYPE(obj)->tp_name);
3346
0
    return 0;
3347
0
}
3348
3349
int
3350
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3351
0
{
3352
0
    wchar_t **p = (wchar_t **)ptr;
3353
0
    if (obj == NULL) {
3354
0
        PyMem_Free(*p);
3355
0
        *p = NULL;
3356
0
        return 1;
3357
0
    }
3358
0
    if (obj == Py_None) {
3359
0
        *p = NULL;
3360
0
        return 1;
3361
0
    }
3362
0
    if (PyUnicode_Check(obj)) {
3363
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3364
0
        if (*p == NULL) {
3365
0
            return 0;
3366
0
        }
3367
0
        return Py_CLEANUP_SUPPORTED;
3368
0
    }
3369
0
    PyErr_Format(PyExc_TypeError,
3370
0
                 "argument must be str or None, not %.50s",
3371
0
                 Py_TYPE(obj)->tp_name);
3372
0
    return 0;
3373
0
}
3374
3375
PyObject *
3376
PyUnicode_FromOrdinal(int ordinal)
3377
9.07M
{
3378
9.07M
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3379
34
        PyErr_SetString(PyExc_ValueError,
3380
34
                        "chr() arg not in range(0x110000)");
3381
34
        return NULL;
3382
34
    }
3383
3384
9.07M
    return unicode_char((Py_UCS4)ordinal);
3385
9.07M
}
3386
3387
PyObject *
3388
PyUnicode_FromObject(PyObject *obj)
3389
2.06M
{
3390
    /* XXX Perhaps we should make this API an alias of
3391
       PyObject_Str() instead ?! */
3392
2.06M
    if (PyUnicode_CheckExact(obj)) {
3393
2.06M
        return Py_NewRef(obj);
3394
2.06M
    }
3395
0
    if (PyUnicode_Check(obj)) {
3396
        /* For a Unicode subtype that's not a Unicode object,
3397
           return a true Unicode object with the same data. */
3398
0
        return _PyUnicode_Copy(obj);
3399
0
    }
3400
0
    PyErr_Format(PyExc_TypeError,
3401
0
                 "Can't convert '%.100s' object to str implicitly",
3402
0
                 Py_TYPE(obj)->tp_name);
3403
0
    return NULL;
3404
0
}
3405
3406
PyObject *
3407
PyUnicode_FromEncodedObject(PyObject *obj,
3408
                            const char *encoding,
3409
                            const char *errors)
3410
21.9M
{
3411
21.9M
    Py_buffer buffer;
3412
21.9M
    PyObject *v;
3413
3414
21.9M
    if (obj == NULL) {
3415
0
        PyErr_BadInternalCall();
3416
0
        return NULL;
3417
0
    }
3418
3419
    /* Decoding bytes objects is the most common case and should be fast */
3420
21.9M
    if (PyBytes_Check(obj)) {
3421
21.5M
        if (PyBytes_GET_SIZE(obj) == 0) {
3422
2.50M
            if (unicode_check_encoding_errors(encoding, errors) < 0) {
3423
0
                return NULL;
3424
0
            }
3425
2.50M
            _Py_RETURN_UNICODE_EMPTY();
3426
2.50M
        }
3427
19.0M
        return PyUnicode_Decode(
3428
19.0M
                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3429
19.0M
                encoding, errors);
3430
21.5M
    }
3431
3432
479k
    if (PyUnicode_Check(obj)) {
3433
0
        PyErr_SetString(PyExc_TypeError,
3434
0
                        "decoding str is not supported");
3435
0
        return NULL;
3436
0
    }
3437
3438
    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3439
479k
    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3440
0
        PyErr_Format(PyExc_TypeError,
3441
0
                     "decoding to str: need a bytes-like object, %.80s found",
3442
0
                     Py_TYPE(obj)->tp_name);
3443
0
        return NULL;
3444
0
    }
3445
3446
479k
    if (buffer.len == 0) {
3447
0
        PyBuffer_Release(&buffer);
3448
0
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
3449
0
            return NULL;
3450
0
        }
3451
0
        _Py_RETURN_UNICODE_EMPTY();
3452
0
    }
3453
3454
479k
    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
3455
479k
    PyBuffer_Release(&buffer);
3456
479k
    return v;
3457
479k
}
3458
3459
/* Normalize an encoding name like encodings.normalize_encoding()
3460
   but allow to convert to lowercase if *to_lower* is true.
3461
   Return 1 on success, or 0 on error (encoding is longer than lower_len-1). */
3462
int
3463
_Py_normalize_encoding(const char *encoding,
3464
                       char *lower,
3465
                       size_t lower_len,
3466
                       int to_lower)
3467
26.3M
{
3468
26.3M
    const char *e;
3469
26.3M
    char *l;
3470
26.3M
    char *l_end;
3471
26.3M
    int punct;
3472
3473
26.3M
    assert(encoding != NULL);
3474
3475
26.3M
    e = encoding;
3476
26.3M
    l = lower;
3477
26.3M
    l_end = &lower[lower_len - 1];
3478
26.3M
    punct = 0;
3479
173M
    while (1) {
3480
173M
        char c = *e;
3481
173M
        if (c == 0) {
3482
25.7M
            break;
3483
25.7M
        }
3484
3485
147M
        if (Py_ISALNUM(c) || c == '.') {
3486
131M
            if (punct && l != lower) {
3487
12.3M
                if (l == l_end) {
3488
1.40k
                    return 0;
3489
1.40k
                }
3490
12.3M
                *l++ = '_';
3491
12.3M
            }
3492
131M
            punct = 0;
3493
3494
131M
            if (l == l_end) {
3495
581k
                return 0;
3496
581k
            }
3497
130M
            *l++ = to_lower ? Py_TOLOWER(c) : c;
3498
130M
        }
3499
16.2M
        else {
3500
16.2M
            punct = 1;
3501
16.2M
        }
3502
3503
147M
        e++;
3504
147M
    }
3505
25.7M
    *l = '\0';
3506
25.7M
    return 1;
3507
26.3M
}
3508
3509
PyObject *
3510
PyUnicode_Decode(const char *s,
3511
                 Py_ssize_t size,
3512
                 const char *encoding,
3513
                 const char *errors)
3514
19.5M
{
3515
19.5M
    PyObject *buffer = NULL, *unicode;
3516
19.5M
    Py_buffer info;
3517
19.5M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3518
3519
19.5M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3520
0
        return NULL;
3521
0
    }
3522
3523
19.5M
    if (size == 0) {
3524
0
        _Py_RETURN_UNICODE_EMPTY();
3525
0
    }
3526
3527
19.5M
    if (encoding == NULL) {
3528
45.1k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3529
45.1k
    }
3530
3531
    /* Shortcuts for common default encodings */
3532
19.5M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3533
19.5M
        char *lower = buflower;
3534
3535
        /* Fast paths */
3536
19.5M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3537
3.49M
            lower += 3;
3538
3.49M
            if (*lower == '_') {
3539
                /* Match "utf8" and "utf_8" */
3540
3.49M
                lower++;
3541
3.49M
            }
3542
3543
3.49M
            if (lower[0] == '8' && lower[1] == 0) {
3544
3.48M
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3545
3.48M
            }
3546
1.18k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3547
183
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3548
183
            }
3549
1.00k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3550
160
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3551
160
            }
3552
3.49M
        }
3553
16.0M
        else {
3554
16.0M
            if (strcmp(lower, "ascii") == 0
3555
11.9M
                || strcmp(lower, "us_ascii") == 0) {
3556
11.9M
                return PyUnicode_DecodeASCII(s, size, errors);
3557
11.9M
            }
3558
    #ifdef MS_WINDOWS
3559
            else if (strcmp(lower, "mbcs") == 0) {
3560
                return PyUnicode_DecodeMBCS(s, size, errors);
3561
            }
3562
    #endif
3563
4.11M
            else if (strcmp(lower, "latin1") == 0
3564
4.11M
                     || strcmp(lower, "latin_1") == 0
3565
1.18M
                     || strcmp(lower, "iso_8859_1") == 0
3566
2.95M
                     || strcmp(lower, "iso8859_1") == 0) {
3567
2.95M
                return PyUnicode_DecodeLatin1(s, size, errors);
3568
2.95M
            }
3569
16.0M
        }
3570
19.5M
    }
3571
3572
    /* Decode via the codec registry */
3573
1.17M
    buffer = NULL;
3574
1.17M
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3575
0
        goto onError;
3576
1.17M
    buffer = PyMemoryView_FromBuffer(&info);
3577
1.17M
    if (buffer == NULL)
3578
0
        goto onError;
3579
1.17M
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3580
1.17M
    if (unicode == NULL)
3581
94.0k
        goto onError;
3582
1.07M
    if (!PyUnicode_Check(unicode)) {
3583
0
        PyErr_Format(PyExc_TypeError,
3584
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3585
0
                     "use codecs.decode() to decode to arbitrary types",
3586
0
                     encoding,
3587
0
                     Py_TYPE(unicode)->tp_name);
3588
0
        Py_DECREF(unicode);
3589
0
        goto onError;
3590
0
    }
3591
1.07M
    Py_DECREF(buffer);
3592
1.07M
    return unicode_result(unicode);
3593
3594
94.0k
  onError:
3595
94.0k
    Py_XDECREF(buffer);
3596
94.0k
    return NULL;
3597
1.07M
}
3598
3599
PyAPI_FUNC(PyObject *)
3600
PyUnicode_AsDecodedObject(PyObject *unicode,
3601
                          const char *encoding,
3602
                          const char *errors)
3603
0
{
3604
0
    if (!PyUnicode_Check(unicode)) {
3605
0
        PyErr_BadArgument();
3606
0
        return NULL;
3607
0
    }
3608
3609
0
    if (encoding == NULL)
3610
0
        encoding = PyUnicode_GetDefaultEncoding();
3611
3612
    /* Decode via the codec registry */
3613
0
    return PyCodec_Decode(unicode, encoding, errors);
3614
0
}
3615
3616
PyAPI_FUNC(PyObject *)
3617
PyUnicode_AsDecodedUnicode(PyObject *unicode,
3618
                           const char *encoding,
3619
                           const char *errors)
3620
0
{
3621
0
    PyObject *v;
3622
3623
0
    if (!PyUnicode_Check(unicode)) {
3624
0
        PyErr_BadArgument();
3625
0
        goto onError;
3626
0
    }
3627
3628
0
    if (encoding == NULL)
3629
0
        encoding = PyUnicode_GetDefaultEncoding();
3630
3631
    /* Decode via the codec registry */
3632
0
    v = PyCodec_Decode(unicode, encoding, errors);
3633
0
    if (v == NULL)
3634
0
        goto onError;
3635
0
    if (!PyUnicode_Check(v)) {
3636
0
        PyErr_Format(PyExc_TypeError,
3637
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3638
0
                     "use codecs.decode() to decode to arbitrary types",
3639
0
                     encoding,
3640
0
                     Py_TYPE(unicode)->tp_name);
3641
0
        Py_DECREF(v);
3642
0
        goto onError;
3643
0
    }
3644
0
    return unicode_result(v);
3645
3646
0
  onError:
3647
0
    return NULL;
3648
0
}
3649
3650
PyAPI_FUNC(PyObject *)
3651
PyUnicode_AsEncodedObject(PyObject *unicode,
3652
                          const char *encoding,
3653
                          const char *errors)
3654
0
{
3655
0
    PyObject *v;
3656
3657
0
    if (!PyUnicode_Check(unicode)) {
3658
0
        PyErr_BadArgument();
3659
0
        goto onError;
3660
0
    }
3661
3662
0
    if (encoding == NULL)
3663
0
        encoding = PyUnicode_GetDefaultEncoding();
3664
3665
    /* Encode via the codec registry */
3666
0
    v = PyCodec_Encode(unicode, encoding, errors);
3667
0
    if (v == NULL)
3668
0
        goto onError;
3669
0
    return v;
3670
3671
0
  onError:
3672
0
    return NULL;
3673
0
}
3674
3675
3676
static PyObject *
3677
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3678
                      int current_locale)
3679
0
{
3680
0
    Py_ssize_t wlen;
3681
0
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3682
0
    if (wstr == NULL) {
3683
0
        return NULL;
3684
0
    }
3685
3686
0
    if ((size_t)wlen != wcslen(wstr)) {
3687
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3688
0
        PyMem_Free(wstr);
3689
0
        return NULL;
3690
0
    }
3691
3692
0
    char *str;
3693
0
    size_t error_pos;
3694
0
    const char *reason;
3695
0
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3696
0
                                 current_locale, error_handler);
3697
0
    PyMem_Free(wstr);
3698
3699
0
    if (res != 0) {
3700
0
        if (res == -2) {
3701
0
            PyObject *exc;
3702
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3703
0
                    "locale", unicode,
3704
0
                    (Py_ssize_t)error_pos,
3705
0
                    (Py_ssize_t)(error_pos+1),
3706
0
                    reason);
3707
0
            if (exc != NULL) {
3708
0
                PyCodec_StrictErrors(exc);
3709
0
                Py_DECREF(exc);
3710
0
            }
3711
0
        }
3712
0
        else if (res == -3) {
3713
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3714
0
        }
3715
0
        else {
3716
0
            PyErr_NoMemory();
3717
0
        }
3718
0
        return NULL;
3719
0
    }
3720
3721
0
    PyObject *bytes = PyBytes_FromString(str);
3722
0
    PyMem_RawFree(str);
3723
0
    return bytes;
3724
0
}
3725
3726
PyObject *
3727
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3728
0
{
3729
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3730
0
    return unicode_encode_locale(unicode, error_handler, 1);
3731
0
}
3732
3733
PyObject *
3734
PyUnicode_EncodeFSDefault(PyObject *unicode)
3735
787k
{
3736
787k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3737
787k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3738
787k
    if (fs_codec->utf8) {
3739
787k
        return unicode_encode_utf8(unicode,
3740
787k
                                   fs_codec->error_handler,
3741
787k
                                   fs_codec->errors);
3742
787k
    }
3743
0
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3744
0
    else if (fs_codec->encoding) {
3745
0
        return PyUnicode_AsEncodedString(unicode,
3746
0
                                         fs_codec->encoding,
3747
0
                                         fs_codec->errors);
3748
0
    }
3749
0
#endif
3750
0
    else {
3751
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3752
           machinery is not ready and so cannot be used:
3753
           use wcstombs() in this case. */
3754
0
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3755
0
        const wchar_t *filesystem_errors = config->filesystem_errors;
3756
0
        assert(filesystem_errors != NULL);
3757
0
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3758
0
        assert(errors != _Py_ERROR_UNKNOWN);
3759
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3760
        return unicode_encode_utf8(unicode, errors, NULL);
3761
#else
3762
0
        return unicode_encode_locale(unicode, errors, 0);
3763
0
#endif
3764
0
    }
3765
787k
}
3766
3767
PyObject *
3768
PyUnicode_AsEncodedString(PyObject *unicode,
3769
                          const char *encoding,
3770
                          const char *errors)
3771
19.6M
{
3772
19.6M
    PyObject *v;
3773
19.6M
    char buflower[11];   /* strlen("iso_8859_1\0") == 11, longest shortcut */
3774
3775
19.6M
    if (!PyUnicode_Check(unicode)) {
3776
0
        PyErr_BadArgument();
3777
0
        return NULL;
3778
0
    }
3779
3780
19.6M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3781
0
        return NULL;
3782
0
    }
3783
3784
19.6M
    if (encoding == NULL) {
3785
12.8M
        return _PyUnicode_AsUTF8String(unicode, errors);
3786
12.8M
    }
3787
3788
    /* Shortcuts for common default encodings */
3789
6.81M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower), 1)) {
3790
6.23M
        char *lower = buflower;
3791
3792
        /* Fast paths */
3793
6.23M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3794
5.13M
            lower += 3;
3795
5.13M
            if (*lower == '_') {
3796
                /* Match "utf8" and "utf_8" */
3797
5.13M
                lower++;
3798
5.13M
            }
3799
3800
5.13M
            if (lower[0] == '8' && lower[1] == 0) {
3801
5.12M
                return _PyUnicode_AsUTF8String(unicode, errors);
3802
5.12M
            }
3803
7.12k
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3804
0
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3805
0
            }
3806
7.12k
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3807
0
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3808
0
            }
3809
5.13M
        }
3810
1.10M
        else {
3811
1.10M
            if (strcmp(lower, "ascii") == 0
3812
833k
                || strcmp(lower, "us_ascii") == 0) {
3813
833k
                return _PyUnicode_AsASCIIString(unicode, errors);
3814
833k
            }
3815
#ifdef MS_WINDOWS
3816
            else if (strcmp(lower, "mbcs") == 0) {
3817
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3818
            }
3819
#endif
3820
269k
            else if (strcmp(lower, "latin1") == 0 ||
3821
269k
                     strcmp(lower, "latin_1") == 0 ||
3822
269k
                     strcmp(lower, "iso_8859_1") == 0 ||
3823
269k
                     strcmp(lower, "iso8859_1") == 0) {
3824
10
                return _PyUnicode_AsLatin1String(unicode, errors);
3825
10
            }
3826
1.10M
        }
3827
6.23M
    }
3828
3829
    /* Encode via the codec registry */
3830
855k
    v = _PyCodec_EncodeText(unicode, encoding, errors);
3831
855k
    if (v == NULL)
3832
0
        return NULL;
3833
3834
    /* The normal path */
3835
855k
    if (PyBytes_Check(v))
3836
855k
        return v;
3837
3838
    /* If the codec returns a buffer, raise a warning and convert to bytes */
3839
0
    if (PyByteArray_Check(v)) {
3840
0
        int error;
3841
0
        PyObject *b;
3842
3843
0
        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3844
0
            "encoder %s returned bytearray instead of bytes; "
3845
0
            "use codecs.encode() to encode to arbitrary types",
3846
0
            encoding);
3847
0
        if (error) {
3848
0
            Py_DECREF(v);
3849
0
            return NULL;
3850
0
        }
3851
3852
0
        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3853
0
                                      PyByteArray_GET_SIZE(v));
3854
0
        Py_DECREF(v);
3855
0
        return b;
3856
0
    }
3857
3858
0
    PyErr_Format(PyExc_TypeError,
3859
0
                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3860
0
                 "use codecs.encode() to encode to arbitrary types",
3861
0
                 encoding,
3862
0
                 Py_TYPE(v)->tp_name);
3863
0
    Py_DECREF(v);
3864
0
    return NULL;
3865
0
}
3866
3867
PyAPI_FUNC(PyObject *)
3868
PyUnicode_AsEncodedUnicode(PyObject *unicode,
3869
                           const char *encoding,
3870
                           const char *errors)
3871
0
{
3872
0
    PyObject *v;
3873
3874
0
    if (!PyUnicode_Check(unicode)) {
3875
0
        PyErr_BadArgument();
3876
0
        goto onError;
3877
0
    }
3878
3879
0
    if (encoding == NULL)
3880
0
        encoding = PyUnicode_GetDefaultEncoding();
3881
3882
    /* Encode via the codec registry */
3883
0
    v = PyCodec_Encode(unicode, encoding, errors);
3884
0
    if (v == NULL)
3885
0
        goto onError;
3886
0
    if (!PyUnicode_Check(v)) {
3887
0
        PyErr_Format(PyExc_TypeError,
3888
0
                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3889
0
                     "use codecs.encode() to encode to arbitrary types",
3890
0
                     encoding,
3891
0
                     Py_TYPE(v)->tp_name);
3892
0
        Py_DECREF(v);
3893
0
        goto onError;
3894
0
    }
3895
0
    return v;
3896
3897
0
  onError:
3898
0
    return NULL;
3899
0
}
3900
3901
static PyObject*
3902
unicode_decode_locale(const char *str, Py_ssize_t len,
3903
                      _Py_error_handler errors, int current_locale)
3904
284k
{
3905
284k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3906
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3907
0
        return NULL;
3908
0
    }
3909
3910
284k
    wchar_t *wstr;
3911
284k
    size_t wlen;
3912
284k
    const char *reason;
3913
284k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3914
284k
                                 current_locale, errors);
3915
284k
    if (res != 0) {
3916
0
        if (res == -2) {
3917
0
            PyObject *exc;
3918
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3919
0
                                        "locale", str, len,
3920
0
                                        (Py_ssize_t)wlen,
3921
0
                                        (Py_ssize_t)(wlen + 1),
3922
0
                                        reason);
3923
0
            if (exc != NULL) {
3924
0
                PyCodec_StrictErrors(exc);
3925
0
                Py_DECREF(exc);
3926
0
            }
3927
0
        }
3928
0
        else if (res == -3) {
3929
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3930
0
        }
3931
0
        else {
3932
0
            PyErr_NoMemory();
3933
0
        }
3934
0
        return NULL;
3935
0
    }
3936
3937
284k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3938
284k
    PyMem_RawFree(wstr);
3939
284k
    return unicode;
3940
284k
}
3941
3942
PyObject*
3943
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3944
                              const char *errors)
3945
0
{
3946
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3947
0
    return unicode_decode_locale(str, len, error_handler, 1);
3948
0
}
3949
3950
PyObject*
3951
PyUnicode_DecodeLocale(const char *str, const char *errors)
3952
284k
{
3953
284k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3954
284k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3955
284k
    return unicode_decode_locale(str, size, error_handler, 1);
3956
284k
}
3957
3958
3959
PyObject*
3960
235
PyUnicode_DecodeFSDefault(const char *s) {
3961
235
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3962
235
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3963
235
}
3964
3965
PyObject*
3966
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3967
131k
{
3968
131k
    PyInterpreterState *interp = _PyInterpreterState_GET();
3969
131k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3970
131k
    if (fs_codec->utf8) {
3971
131k
        return unicode_decode_utf8(s, size,
3972
131k
                                   fs_codec->error_handler,
3973
131k
                                   fs_codec->errors,
3974
131k
                                   NULL);
3975
131k
    }
3976
37
#ifndef _Py_FORCE_UTF8_FS_ENCODING
3977
37
    else if (fs_codec->encoding) {
3978
0
        return PyUnicode_Decode(s, size,
3979
0
                                fs_codec->encoding,
3980
0
                                fs_codec->errors);
3981
0
    }
3982
37
#endif
3983
37
    else {
3984
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
3985
           machinery is not ready and so cannot be used:
3986
           use mbstowcs() in this case. */
3987
37
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3988
37
        const wchar_t *filesystem_errors = config->filesystem_errors;
3989
37
        assert(filesystem_errors != NULL);
3990
37
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3991
37
        assert(errors != _Py_ERROR_UNKNOWN);
3992
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3993
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3994
#else
3995
37
        return unicode_decode_locale(s, size, errors, 0);
3996
37
#endif
3997
37
    }
3998
131k
}
3999
4000
4001
int
4002
PyUnicode_FSConverter(PyObject* arg, void* addr)
4003
208k
{
4004
208k
    PyObject *path = NULL;
4005
208k
    PyObject *output = NULL;
4006
208k
    Py_ssize_t size;
4007
208k
    const char *data;
4008
208k
    if (arg == NULL) {
4009
0
        Py_DECREF(*(PyObject**)addr);
4010
0
        *(PyObject**)addr = NULL;
4011
0
        return 1;
4012
0
    }
4013
208k
    path = PyOS_FSPath(arg);
4014
208k
    if (path == NULL) {
4015
0
        return 0;
4016
0
    }
4017
208k
    if (PyBytes_Check(path)) {
4018
0
        output = path;
4019
0
    }
4020
208k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4021
208k
        output = PyUnicode_EncodeFSDefault(path);
4022
208k
        Py_DECREF(path);
4023
208k
        if (!output) {
4024
0
            return 0;
4025
0
        }
4026
208k
        assert(PyBytes_Check(output));
4027
208k
    }
4028
4029
208k
    size = PyBytes_GET_SIZE(output);
4030
208k
    data = PyBytes_AS_STRING(output);
4031
208k
    if ((size_t)size != strlen(data)) {
4032
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4033
0
        Py_DECREF(output);
4034
0
        return 0;
4035
0
    }
4036
208k
    *(PyObject**)addr = output;
4037
208k
    return Py_CLEANUP_SUPPORTED;
4038
208k
}
4039
4040
4041
int
4042
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4043
99.2k
{
4044
99.2k
    if (arg == NULL) {
4045
0
        Py_DECREF(*(PyObject**)addr);
4046
0
        *(PyObject**)addr = NULL;
4047
0
        return 1;
4048
0
    }
4049
4050
99.2k
    PyObject *path = PyOS_FSPath(arg);
4051
99.2k
    if (path == NULL) {
4052
0
        return 0;
4053
0
    }
4054
4055
99.2k
    PyObject *output = NULL;
4056
99.2k
    if (PyUnicode_Check(path)) {
4057
99.2k
        output = path;
4058
99.2k
    }
4059
0
    else if (PyBytes_Check(path)) {
4060
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4061
0
                                                  PyBytes_GET_SIZE(path));
4062
0
        Py_DECREF(path);
4063
0
        if (!output) {
4064
0
            return 0;
4065
0
        }
4066
0
    }
4067
0
    else {
4068
0
        PyErr_Format(PyExc_TypeError,
4069
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4070
0
                     Py_TYPE(arg)->tp_name);
4071
0
        Py_DECREF(path);
4072
0
        return 0;
4073
0
    }
4074
4075
99.2k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4076
99.2k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4077
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4078
0
        Py_DECREF(output);
4079
0
        return 0;
4080
0
    }
4081
99.2k
    *(PyObject**)addr = output;
4082
99.2k
    return Py_CLEANUP_SUPPORTED;
4083
99.2k
}
4084
4085
4086
static int unicode_fill_utf8(PyObject *unicode);
4087
4088
4089
static int
4090
unicode_ensure_utf8(PyObject *unicode)
4091
62.9M
{
4092
62.9M
    int err = 0;
4093
62.9M
    if (PyUnicode_UTF8(unicode) == NULL) {
4094
161k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4095
161k
        if (PyUnicode_UTF8(unicode) == NULL) {
4096
161k
            err = unicode_fill_utf8(unicode);
4097
161k
        }
4098
161k
        Py_END_CRITICAL_SECTION();
4099
161k
    }
4100
62.9M
    return err;
4101
62.9M
}
4102
4103
const char *
4104
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4105
62.9M
{
4106
62.9M
    if (!PyUnicode_Check(unicode)) {
4107
0
        PyErr_BadArgument();
4108
0
        if (psize) {
4109
0
            *psize = -1;
4110
0
        }
4111
0
        return NULL;
4112
0
    }
4113
4114
62.9M
    if (unicode_ensure_utf8(unicode) == -1) {
4115
206
        if (psize) {
4116
206
            *psize = -1;
4117
206
        }
4118
206
        return NULL;
4119
206
    }
4120
4121
62.9M
    if (psize) {
4122
62.6M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4123
62.6M
    }
4124
62.9M
    return PyUnicode_UTF8(unicode);
4125
62.9M
}
4126
4127
const char *
4128
PyUnicode_AsUTF8(PyObject *unicode)
4129
247k
{
4130
247k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4131
247k
}
4132
4133
const char *
4134
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4135
2.86M
{
4136
2.86M
    Py_ssize_t size;
4137
2.86M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4138
2.86M
    if (s && strlen(s) != (size_t)size) {
4139
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4140
0
        return NULL;
4141
0
    }
4142
2.86M
    return s;
4143
2.86M
}
4144
4145
/*
4146
PyUnicode_GetSize() has been deprecated since Python 3.3
4147
because it returned length of Py_UNICODE.
4148
4149
But this function is part of stable abi, because it doesn't
4150
include Py_UNICODE in signature and it was not excluded from
4151
stable ABI in PEP 384.
4152
*/
4153
PyAPI_FUNC(Py_ssize_t)
4154
PyUnicode_GetSize(PyObject *unicode)
4155
0
{
4156
0
    PyErr_SetString(PyExc_RuntimeError,
4157
0
                    "PyUnicode_GetSize has been removed.");
4158
0
    return -1;
4159
0
}
4160
4161
Py_ssize_t
4162
PyUnicode_GetLength(PyObject *unicode)
4163
25.9k
{
4164
25.9k
    if (!PyUnicode_Check(unicode)) {
4165
0
        PyErr_BadArgument();
4166
0
        return -1;
4167
0
    }
4168
25.9k
    return PyUnicode_GET_LENGTH(unicode);
4169
25.9k
}
4170
4171
Py_UCS4
4172
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4173
26
{
4174
26
    const void *data;
4175
26
    int kind;
4176
4177
26
    if (!PyUnicode_Check(unicode)) {
4178
0
        PyErr_BadArgument();
4179
0
        return (Py_UCS4)-1;
4180
0
    }
4181
26
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4182
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4183
0
        return (Py_UCS4)-1;
4184
0
    }
4185
26
    data = PyUnicode_DATA(unicode);
4186
26
    kind = PyUnicode_KIND(unicode);
4187
26
    return PyUnicode_READ(kind, data, index);
4188
26
}
4189
4190
int
4191
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4192
0
{
4193
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4194
0
        PyErr_BadArgument();
4195
0
        return -1;
4196
0
    }
4197
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4198
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4199
0
        return -1;
4200
0
    }
4201
0
    if (unicode_check_modifiable(unicode))
4202
0
        return -1;
4203
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4204
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4205
0
        return -1;
4206
0
    }
4207
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4208
0
                    index, ch);
4209
0
    return 0;
4210
0
}
4211
4212
const char *
4213
PyUnicode_GetDefaultEncoding(void)
4214
0
{
4215
0
    return "utf-8";
4216
0
}
4217
4218
/* create or adjust a UnicodeDecodeError */
4219
static void
4220
make_decode_exception(PyObject **exceptionObject,
4221
                      const char *encoding,
4222
                      const char *input, Py_ssize_t length,
4223
                      Py_ssize_t startpos, Py_ssize_t endpos,
4224
                      const char *reason)
4225
2.58M
{
4226
2.58M
    if (*exceptionObject == NULL) {
4227
2.35M
        *exceptionObject = PyUnicodeDecodeError_Create(
4228
2.35M
            encoding, input, length, startpos, endpos, reason);
4229
2.35M
    }
4230
238k
    else {
4231
238k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4232
0
            goto onError;
4233
238k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4234
0
            goto onError;
4235
238k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4236
0
            goto onError;
4237
238k
    }
4238
2.58M
    return;
4239
4240
2.58M
onError:
4241
0
    Py_CLEAR(*exceptionObject);
4242
0
}
4243
4244
#ifdef MS_WINDOWS
4245
static int
4246
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4247
{
4248
    if (newsize > *size) {
4249
        wchar_t *newbuf = *buf;
4250
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4251
            PyErr_NoMemory();
4252
            return -1;
4253
        }
4254
        *buf = newbuf;
4255
    }
4256
    *size = newsize;
4257
    return 0;
4258
}
4259
4260
/* error handling callback helper:
4261
   build arguments, call the callback and check the arguments,
4262
   if no exception occurred, copy the replacement to the output
4263
   and adjust various state variables.
4264
   return 0 on success, -1 on error
4265
*/
4266
4267
static int
4268
unicode_decode_call_errorhandler_wchar(
4269
    const char *errors, PyObject **errorHandler,
4270
    const char *encoding, const char *reason,
4271
    const char **input, const char **inend, Py_ssize_t *startinpos,
4272
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4273
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4274
{
4275
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4276
4277
    PyObject *restuple = NULL;
4278
    PyObject *repunicode = NULL;
4279
    Py_ssize_t outsize;
4280
    Py_ssize_t insize;
4281
    Py_ssize_t requiredsize;
4282
    Py_ssize_t newpos;
4283
    PyObject *inputobj = NULL;
4284
    Py_ssize_t repwlen;
4285
4286
    if (*errorHandler == NULL) {
4287
        *errorHandler = PyCodec_LookupError(errors);
4288
        if (*errorHandler == NULL)
4289
            goto onError;
4290
    }
4291
4292
    make_decode_exception(exceptionObject,
4293
        encoding,
4294
        *input, *inend - *input,
4295
        *startinpos, *endinpos,
4296
        reason);
4297
    if (*exceptionObject == NULL)
4298
        goto onError;
4299
4300
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4301
    if (restuple == NULL)
4302
        goto onError;
4303
    if (!PyTuple_Check(restuple)) {
4304
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4305
        goto onError;
4306
    }
4307
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4308
        goto onError;
4309
4310
    /* Copy back the bytes variables, which might have been modified by the
4311
       callback */
4312
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4313
    if (!inputobj)
4314
        goto onError;
4315
    *input = PyBytes_AS_STRING(inputobj);
4316
    insize = PyBytes_GET_SIZE(inputobj);
4317
    *inend = *input + insize;
4318
    /* we can DECREF safely, as the exception has another reference,
4319
       so the object won't go away. */
4320
    Py_DECREF(inputobj);
4321
4322
    if (newpos<0)
4323
        newpos = insize+newpos;
4324
    if (newpos<0 || newpos>insize) {
4325
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4326
        goto onError;
4327
    }
4328
4329
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4330
    if (repwlen < 0)
4331
        goto onError;
4332
    repwlen--;
4333
    /* need more space? (at least enough for what we
4334
       have+the replacement+the rest of the string (starting
4335
       at the new input position), so we won't have to check space
4336
       when there are no errors in the rest of the string) */
4337
    requiredsize = *outpos;
4338
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4339
        goto overflow;
4340
    requiredsize += repwlen;
4341
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4342
        goto overflow;
4343
    requiredsize += insize - newpos;
4344
    outsize = *bufsize;
4345
    if (requiredsize > outsize) {
4346
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4347
            requiredsize = 2*outsize;
4348
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4349
            goto onError;
4350
        }
4351
    }
4352
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4353
    *outpos += repwlen;
4354
    *endinpos = newpos;
4355
    *inptr = *input + newpos;
4356
4357
    /* we made it! */
4358
    Py_DECREF(restuple);
4359
    return 0;
4360
4361
  overflow:
4362
    PyErr_SetString(PyExc_OverflowError,
4363
                    "decoded result is too long for a Python string");
4364
4365
  onError:
4366
    Py_XDECREF(restuple);
4367
    return -1;
4368
}
4369
#endif   /* MS_WINDOWS */
4370
4371
static int
4372
unicode_decode_call_errorhandler_writer(
4373
    const char *errors, PyObject **errorHandler,
4374
    const char *encoding, const char *reason,
4375
    const char **input, const char **inend, Py_ssize_t *startinpos,
4376
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4377
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4378
2.58M
{
4379
2.58M
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4380
4381
2.58M
    PyObject *restuple = NULL;
4382
2.58M
    PyObject *repunicode = NULL;
4383
2.58M
    Py_ssize_t insize;
4384
2.58M
    Py_ssize_t newpos;
4385
2.58M
    Py_ssize_t replen;
4386
2.58M
    Py_ssize_t remain;
4387
2.58M
    PyObject *inputobj = NULL;
4388
2.58M
    int need_to_grow = 0;
4389
2.58M
    const char *new_inptr;
4390
4391
2.58M
    if (*errorHandler == NULL) {
4392
2.35M
        *errorHandler = PyCodec_LookupError(errors);
4393
2.35M
        if (*errorHandler == NULL)
4394
0
            goto onError;
4395
2.35M
    }
4396
4397
2.58M
    make_decode_exception(exceptionObject,
4398
2.58M
        encoding,
4399
2.58M
        *input, *inend - *input,
4400
2.58M
        *startinpos, *endinpos,
4401
2.58M
        reason);
4402
2.58M
    if (*exceptionObject == NULL)
4403
0
        goto onError;
4404
4405
2.58M
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4406
2.58M
    if (restuple == NULL)
4407
2.31M
        goto onError;
4408
278k
    if (!PyTuple_Check(restuple)) {
4409
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4410
0
        goto onError;
4411
0
    }
4412
278k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4413
0
        goto onError;
4414
4415
    /* Copy back the bytes variables, which might have been modified by the
4416
       callback */
4417
278k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4418
278k
    if (!inputobj)
4419
0
        goto onError;
4420
278k
    remain = *inend - *input - *endinpos;
4421
278k
    *input = PyBytes_AS_STRING(inputobj);
4422
278k
    insize = PyBytes_GET_SIZE(inputobj);
4423
278k
    *inend = *input + insize;
4424
    /* we can DECREF safely, as the exception has another reference,
4425
       so the object won't go away. */
4426
278k
    Py_DECREF(inputobj);
4427
4428
278k
    if (newpos<0)
4429
0
        newpos = insize+newpos;
4430
278k
    if (newpos<0 || newpos>insize) {
4431
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4432
0
        goto onError;
4433
0
    }
4434
4435
278k
    replen = PyUnicode_GET_LENGTH(repunicode);
4436
278k
    if (replen > 1) {
4437
36.8k
        writer->min_length += replen - 1;
4438
36.8k
        need_to_grow = 1;
4439
36.8k
    }
4440
278k
    new_inptr = *input + newpos;
4441
278k
    if (*inend - new_inptr > remain) {
4442
        /* We don't know the decoding algorithm here so we make the worst
4443
           assumption that one byte decodes to one unicode character.
4444
           If unfortunately one byte could decode to more unicode characters,
4445
           the decoder may write out-of-bound then.  Is it possible for the
4446
           algorithms using this function? */
4447
21.5k
        writer->min_length += *inend - new_inptr - remain;
4448
21.5k
        need_to_grow = 1;
4449
21.5k
    }
4450
278k
    if (need_to_grow) {
4451
37.0k
        writer->overallocate = 1;
4452
37.0k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4453
37.0k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4454
0
            goto onError;
4455
37.0k
    }
4456
278k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4457
0
        goto onError;
4458
4459
278k
    *endinpos = newpos;
4460
278k
    *inptr = new_inptr;
4461
4462
    /* we made it! */
4463
278k
    Py_DECREF(restuple);
4464
278k
    return 0;
4465
4466
2.31M
  onError:
4467
2.31M
    Py_XDECREF(restuple);
4468
2.31M
    return -1;
4469
278k
}
4470
4471
/* --- UTF-7 Codec -------------------------------------------------------- */
4472
4473
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4474
4475
/* Three simple macros defining base-64. */
4476
4477
/* Is c a base-64 character? */
4478
4479
#define IS_BASE64(c) \
4480
305k
    (((c) >= 'A' && (c) <= 'Z') ||     \
4481
305k
     ((c) >= 'a' && (c) <= 'z') ||     \
4482
305k
     ((c) >= '0' && (c) <= '9') ||     \
4483
305k
     (c) == '+' || (c) == '/')
4484
4485
/* given that c is a base-64 character, what is its base-64 value? */
4486
4487
#define FROM_BASE64(c)                                                  \
4488
269k
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
4489
269k
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
4490
218k
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
4491
131k
     (c) == '+' ? 62 : 63)
4492
4493
/* What is the base-64 character of the bottom 6 bits of n? */
4494
4495
#define TO_BASE64(n)  \
4496
0
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4497
4498
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4499
 * decoded as itself.  We are permissive on decoding; the only ASCII
4500
 * byte not decoding to itself is the + which begins a base64
4501
 * string. */
4502
4503
#define DECODE_DIRECT(c)                                \
4504
7.12M
    ((c) <= 127 && (c) != '+')
4505
4506
/* The UTF-7 encoder treats ASCII characters differently according to
4507
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4508
 * the above).  See RFC2152.  This array identifies these different
4509
 * sets:
4510
 * 0 : "Set D"
4511
 *     alphanumeric and '(),-./:?
4512
 * 1 : "Set O"
4513
 *     !"#$%&*;<=>@[]^_`{|}
4514
 * 2 : "whitespace"
4515
 *     ht nl cr sp
4516
 * 3 : special (must be base64 encoded)
4517
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4518
 */
4519
4520
static
4521
char utf7_category[128] = {
4522
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
4523
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
4524
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
4525
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
4526
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
4527
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
4528
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
4529
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
4530
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
4531
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
4532
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
4533
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
4534
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
4535
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
4536
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
4537
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
4538
};
4539
4540
/* ENCODE_DIRECT: this character should be encoded as itself.  The
4541
 * answer depends on whether we are encoding set O as itself, and also
4542
 * on whether we are encoding whitespace as itself.  RFC 2152 makes it
4543
 * clear that the answers to these questions vary between
4544
 * applications, so this code needs to be flexible.  */
4545
4546
#define ENCODE_DIRECT(c) \
4547
0
    ((c) < 128 && (c) > 0 && ((utf7_category[(c)] != 3)))
4548
4549
PyObject *
4550
PyUnicode_DecodeUTF7(const char *s,
4551
                     Py_ssize_t size,
4552
                     const char *errors)
4553
0
{
4554
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4555
0
}
4556
4557
/* The decoder.  The only state we preserve is our read position,
4558
 * i.e. how many characters we have consumed.  So if we end in the
4559
 * middle of a shift sequence we have to back off the read position
4560
 * and the output to the beginning of the sequence, otherwise we lose
4561
 * all the shift state (seen bits, number of bits seen, high
4562
 * surrogate). */
4563
4564
PyObject *
4565
PyUnicode_DecodeUTF7Stateful(const char *s,
4566
                             Py_ssize_t size,
4567
                             const char *errors,
4568
                             Py_ssize_t *consumed)
4569
22.1k
{
4570
22.1k
    const char *starts = s;
4571
22.1k
    Py_ssize_t startinpos;
4572
22.1k
    Py_ssize_t endinpos;
4573
22.1k
    const char *e;
4574
22.1k
    _PyUnicodeWriter writer;
4575
22.1k
    const char *errmsg = "";
4576
22.1k
    int inShift = 0;
4577
22.1k
    Py_ssize_t shiftOutStart;
4578
22.1k
    unsigned int base64bits = 0;
4579
22.1k
    unsigned long base64buffer = 0;
4580
22.1k
    Py_UCS4 surrogate = 0;
4581
22.1k
    PyObject *errorHandler = NULL;
4582
22.1k
    PyObject *exc = NULL;
4583
4584
22.1k
    if (size == 0) {
4585
0
        if (consumed)
4586
0
            *consumed = 0;
4587
0
        _Py_RETURN_UNICODE_EMPTY();
4588
0
    }
4589
4590
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4591
22.1k
    _PyUnicodeWriter_Init(&writer);
4592
22.1k
    writer.min_length = size;
4593
4594
22.1k
    shiftOutStart = 0;
4595
22.1k
    e = s + size;
4596
4597
7.44M
    while (s < e) {
4598
7.43M
        Py_UCS4 ch;
4599
7.43M
      restart:
4600
7.43M
        ch = (unsigned char) *s;
4601
4602
7.43M
        if (inShift) { /* in a base-64 section */
4603
285k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4604
269k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4605
269k
                base64bits += 6;
4606
269k
                s++;
4607
269k
                if (base64bits >= 16) {
4608
                    /* we have enough bits for a UTF-16 value */
4609
95.8k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4610
95.8k
                    base64bits -= 16;
4611
95.8k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4612
95.8k
                    assert(outCh <= 0xffff);
4613
95.8k
                    if (surrogate) {
4614
                        /* expecting a second surrogate */
4615
8.32k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4616
2.91k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4617
2.91k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4618
0
                                goto onError;
4619
2.91k
                            surrogate = 0;
4620
2.91k
                            continue;
4621
2.91k
                        }
4622
5.40k
                        else {
4623
5.40k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4624
0
                                goto onError;
4625
5.40k
                            surrogate = 0;
4626
5.40k
                        }
4627
8.32k
                    }
4628
92.9k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4629
                        /* first surrogate */
4630
11.4k
                        surrogate = outCh;
4631
11.4k
                    }
4632
81.4k
                    else {
4633
81.4k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4634
0
                            goto onError;
4635
81.4k
                    }
4636
92.9k
                }
4637
269k
            }
4638
15.1k
            else { /* now leaving a base-64 section */
4639
15.1k
                inShift = 0;
4640
15.1k
                if (base64bits > 0) { /* left-over bits */
4641
12.0k
                    if (base64bits >= 6) {
4642
                        /* We've seen at least one base-64 character */
4643
5.59k
                        s++;
4644
5.59k
                        errmsg = "partial character in shift sequence";
4645
5.59k
                        goto utf7Error;
4646
5.59k
                    }
4647
6.44k
                    else {
4648
                        /* Some bits remain; they should be zero */
4649
6.44k
                        if (base64buffer != 0) {
4650
1.52k
                            s++;
4651
1.52k
                            errmsg = "non-zero padding bits in shift sequence";
4652
1.52k
                            goto utf7Error;
4653
1.52k
                        }
4654
6.44k
                    }
4655
12.0k
                }
4656
8.05k
                if (surrogate && DECODE_DIRECT(ch)) {
4657
2.30k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4658
0
                        goto onError;
4659
2.30k
                }
4660
8.05k
                surrogate = 0;
4661
8.05k
                if (ch == '-') {
4662
                    /* '-' is absorbed; other terminating
4663
                       characters are preserved */
4664
1.99k
                    s++;
4665
1.99k
                }
4666
8.05k
            }
4667
285k
        }
4668
7.14M
        else if ( ch == '+' ) {
4669
23.6k
            startinpos = s-starts;
4670
23.6k
            s++; /* consume '+' */
4671
23.6k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4672
2.58k
                s++;
4673
2.58k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4674
0
                    goto onError;
4675
2.58k
            }
4676
21.0k
            else if (s < e && !IS_BASE64(*s)) {
4677
2.53k
                s++;
4678
2.53k
                errmsg = "ill-formed sequence";
4679
2.53k
                goto utf7Error;
4680
2.53k
            }
4681
18.5k
            else { /* begin base64-encoded section */
4682
18.5k
                inShift = 1;
4683
18.5k
                surrogate = 0;
4684
18.5k
                shiftOutStart = writer.pos;
4685
18.5k
                base64bits = 0;
4686
18.5k
                base64buffer = 0;
4687
18.5k
            }
4688
23.6k
        }
4689
7.12M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4690
7.01M
            s++;
4691
7.01M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4692
0
                goto onError;
4693
7.01M
        }
4694
111k
        else {
4695
111k
            startinpos = s-starts;
4696
111k
            s++;
4697
111k
            errmsg = "unexpected special character";
4698
111k
            goto utf7Error;
4699
111k
        }
4700
7.30M
        continue;
4701
7.30M
utf7Error:
4702
121k
        endinpos = s-starts;
4703
121k
        if (unicode_decode_call_errorhandler_writer(
4704
121k
                errors, &errorHandler,
4705
121k
                "utf7", errmsg,
4706
121k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4707
121k
                &writer))
4708
9.89k
            goto onError;
4709
121k
    }
4710
4711
    /* end of string */
4712
4713
12.2k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4714
        /* if we're in an inconsistent state, that's an error */
4715
3.35k
        inShift = 0;
4716
3.35k
        if (surrogate ||
4717
2.98k
                (base64bits >= 6) ||
4718
1.94k
                (base64bits > 0 && base64buffer != 0)) {
4719
1.94k
            endinpos = size;
4720
1.94k
            if (unicode_decode_call_errorhandler_writer(
4721
1.94k
                    errors, &errorHandler,
4722
1.94k
                    "utf7", "unterminated shift sequence",
4723
1.94k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4724
1.94k
                    &writer))
4725
1.59k
                goto onError;
4726
346
            if (s < e)
4727
0
                goto restart;
4728
346
        }
4729
3.35k
    }
4730
4731
    /* return state */
4732
10.6k
    if (consumed) {
4733
0
        if (inShift) {
4734
0
            *consumed = startinpos;
4735
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4736
0
                PyObject *result = PyUnicode_FromKindAndData(
4737
0
                        writer.kind, writer.data, shiftOutStart);
4738
0
                Py_XDECREF(errorHandler);
4739
0
                Py_XDECREF(exc);
4740
0
                _PyUnicodeWriter_Dealloc(&writer);
4741
0
                return result;
4742
0
            }
4743
0
            writer.pos = shiftOutStart; /* back off output */
4744
0
        }
4745
0
        else {
4746
0
            *consumed = s-starts;
4747
0
        }
4748
0
    }
4749
4750
10.6k
    Py_XDECREF(errorHandler);
4751
10.6k
    Py_XDECREF(exc);
4752
10.6k
    return _PyUnicodeWriter_Finish(&writer);
4753
4754
11.4k
  onError:
4755
11.4k
    Py_XDECREF(errorHandler);
4756
11.4k
    Py_XDECREF(exc);
4757
11.4k
    _PyUnicodeWriter_Dealloc(&writer);
4758
11.4k
    return NULL;
4759
10.6k
}
4760
4761
4762
PyObject *
4763
_PyUnicode_EncodeUTF7(PyObject *str,
4764
                      const char *errors)
4765
0
{
4766
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
4767
0
    if (len == 0) {
4768
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
4769
0
    }
4770
0
    int kind = PyUnicode_KIND(str);
4771
0
    const void *data = PyUnicode_DATA(str);
4772
4773
    /* It might be possible to tighten this worst case */
4774
0
    if (len > PY_SSIZE_T_MAX / 8) {
4775
0
        return PyErr_NoMemory();
4776
0
    }
4777
0
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
4778
0
    if (writer == NULL) {
4779
0
        return NULL;
4780
0
    }
4781
4782
0
    int inShift = 0;
4783
0
    unsigned int base64bits = 0;
4784
0
    unsigned long base64buffer = 0;
4785
0
    char *out = PyBytesWriter_GetData(writer);
4786
0
    for (Py_ssize_t i = 0; i < len; ++i) {
4787
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4788
4789
0
        if (inShift) {
4790
0
            if (ENCODE_DIRECT(ch)) {
4791
                /* shifting out */
4792
0
                if (base64bits) { /* output remaining bits */
4793
0
                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4794
0
                    base64buffer = 0;
4795
0
                    base64bits = 0;
4796
0
                }
4797
0
                inShift = 0;
4798
                /* Characters not in the BASE64 set implicitly unshift the sequence
4799
                   so no '-' is required, except if the character is itself a '-' */
4800
0
                if (IS_BASE64(ch) || ch == '-') {
4801
0
                    *out++ = '-';
4802
0
                }
4803
0
                *out++ = (char) ch;
4804
0
            }
4805
0
            else {
4806
0
                goto encode_char;
4807
0
            }
4808
0
        }
4809
0
        else { /* not in a shift sequence */
4810
0
            if (ch == '+') {
4811
0
                *out++ = '+';
4812
0
                        *out++ = '-';
4813
0
            }
4814
0
            else if (ENCODE_DIRECT(ch)) {
4815
0
                *out++ = (char) ch;
4816
0
            }
4817
0
            else {
4818
0
                *out++ = '+';
4819
0
                inShift = 1;
4820
0
                goto encode_char;
4821
0
            }
4822
0
        }
4823
0
        continue;
4824
0
encode_char:
4825
0
        if (ch >= 0x10000) {
4826
0
            assert(ch <= MAX_UNICODE);
4827
4828
            /* code first surrogate */
4829
0
            base64bits += 16;
4830
0
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4831
0
            while (base64bits >= 6) {
4832
0
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4833
0
                base64bits -= 6;
4834
0
            }
4835
            /* prepare second surrogate */
4836
0
            ch = Py_UNICODE_LOW_SURROGATE(ch);
4837
0
        }
4838
0
        base64bits += 16;
4839
0
        base64buffer = (base64buffer << 16) | ch;
4840
0
        while (base64bits >= 6) {
4841
0
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4842
0
            base64bits -= 6;
4843
0
        }
4844
0
    }
4845
0
    if (base64bits)
4846
0
        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4847
0
    if (inShift)
4848
0
        *out++ = '-';
4849
0
    return PyBytesWriter_FinishWithPointer(writer, out);
4850
0
}
4851
4852
#undef IS_BASE64
4853
#undef FROM_BASE64
4854
#undef TO_BASE64
4855
#undef DECODE_DIRECT
4856
#undef ENCODE_DIRECT
4857
4858
/* --- UTF-8 Codec -------------------------------------------------------- */
4859
4860
PyObject *
4861
PyUnicode_DecodeUTF8(const char *s,
4862
                     Py_ssize_t size,
4863
                     const char *errors)
4864
70.1M
{
4865
70.1M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4866
70.1M
}
4867
4868
#include "stringlib/asciilib.h"
4869
#include "stringlib/codecs.h"
4870
#include "stringlib/undef.h"
4871
4872
#include "stringlib/ucs1lib.h"
4873
#include "stringlib/codecs.h"
4874
#include "stringlib/undef.h"
4875
4876
#include "stringlib/ucs2lib.h"
4877
#include "stringlib/codecs.h"
4878
#include "stringlib/undef.h"
4879
4880
#include "stringlib/ucs4lib.h"
4881
#include "stringlib/codecs.h"
4882
#include "stringlib/undef.h"
4883
4884
#if (SIZEOF_SIZE_T == 8)
4885
/* Mask to quickly check whether a C 'size_t' contains a
4886
   non-ASCII, UTF8-encoded char. */
4887
179M
# define ASCII_CHAR_MASK 0x8080808080808080ULL
4888
// used to count codepoints in UTF-8 string.
4889
49.9M
# define VECTOR_0101     0x0101010101010101ULL
4890
931k
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
4891
#elif (SIZEOF_SIZE_T == 4)
4892
# define ASCII_CHAR_MASK 0x80808080U
4893
# define VECTOR_0101     0x01010101U
4894
# define VECTOR_00FF     0x00ff00ffU
4895
#else
4896
# error C 'size_t' size should be either 4 or 8!
4897
#endif
4898
4899
#if (defined(__clang__) || defined(__GNUC__))
4900
#define HAVE_CTZ 1
4901
static inline unsigned int
4902
ctz(size_t v)
4903
12.7M
{
4904
12.7M
    return __builtin_ctzll((unsigned long long)v);
4905
12.7M
}
4906
#elif defined(_MSC_VER)
4907
#define HAVE_CTZ 1
4908
static inline unsigned int
4909
ctz(size_t v)
4910
{
4911
    unsigned long pos;
4912
#if SIZEOF_SIZE_T == 4
4913
    _BitScanForward(&pos, v);
4914
#else
4915
    _BitScanForward64(&pos, v);
4916
#endif /* SIZEOF_SIZE_T */
4917
    return pos;
4918
}
4919
#else
4920
#define HAVE_CTZ 0
4921
#endif
4922
4923
#if HAVE_CTZ && PY_LITTLE_ENDIAN
4924
// load p[0]..p[size-1] as a size_t without unaligned access nor read ahead.
4925
static size_t
4926
load_unaligned(const unsigned char *p, size_t size)
4927
58.0M
{
4928
58.0M
    union {
4929
58.0M
        size_t s;
4930
58.0M
        unsigned char b[SIZEOF_SIZE_T];
4931
58.0M
    } u;
4932
58.0M
    u.s = 0;
4933
    // This switch statement assumes little endian because:
4934
    // * union is faster than bitwise or and shift.
4935
    // * big endian machine is rare and hard to maintain.
4936
58.0M
    switch (size) {
4937
0
    default:
4938
0
#if SIZEOF_SIZE_T == 8
4939
0
    case 8:
4940
0
        u.b[7] = p[7];
4941
0
        _Py_FALLTHROUGH;
4942
3.46M
    case 7:
4943
3.46M
        u.b[6] = p[6];
4944
3.46M
        _Py_FALLTHROUGH;
4945
9.52M
    case 6:
4946
9.52M
        u.b[5] = p[5];
4947
9.52M
        _Py_FALLTHROUGH;
4948
18.3M
    case 5:
4949
18.3M
        u.b[4] = p[4];
4950
18.3M
        _Py_FALLTHROUGH;
4951
18.3M
#endif
4952
23.8M
    case 4:
4953
23.8M
        u.b[3] = p[3];
4954
23.8M
        _Py_FALLTHROUGH;
4955
37.4M
    case 3:
4956
37.4M
        u.b[2] = p[2];
4957
37.4M
        _Py_FALLTHROUGH;
4958
49.7M
    case 2:
4959
49.7M
        u.b[1] = p[1];
4960
49.7M
        _Py_FALLTHROUGH;
4961
52.2M
    case 1:
4962
52.2M
        u.b[0] = p[0];
4963
52.2M
        break;
4964
5.86M
    case 0:
4965
5.86M
        break;
4966
58.0M
    }
4967
58.0M
    return u.s;
4968
58.0M
}
4969
#endif
4970
4971
/*
4972
 * Find the first non-ASCII character in a byte sequence.
4973
 *
4974
 * This function scans a range of bytes from `start` to `end` and returns the
4975
 * index of the first byte that is not an ASCII character (i.e., has the most
4976
 * significant bit set). If all characters in the range are ASCII, it returns
4977
 * `end - start`.
4978
 */
4979
static Py_ssize_t
4980
find_first_nonascii(const unsigned char *start, const unsigned char *end)
4981
64.4M
{
4982
    // The search is done in `size_t` chunks.
4983
    // The start and end might not be aligned at `size_t` boundaries,
4984
    // so they're handled specially.
4985
4986
64.4M
    const unsigned char *p = start;
4987
4988
64.4M
    if (end - start >= SIZEOF_SIZE_T) {
4989
        // Avoid unaligned read.
4990
22.1M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
4991
22.1M
        size_t u;
4992
22.1M
        memcpy(&u, p, sizeof(size_t));
4993
22.1M
        u &= ASCII_CHAR_MASK;
4994
22.1M
        if (u) {
4995
5.14M
            return (ctz(u) - 7) / 8;
4996
5.14M
        }
4997
17.0M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
4998
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
4999
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5000
        while (p < p2) {
5001
            if (*p & 0x80) {
5002
                return p - start;
5003
            }
5004
            p++;
5005
        }
5006
#endif
5007
5008
17.0M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5009
108M
        while (p <= e) {
5010
92.2M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5011
92.2M
            if (u) {
5012
1.25M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5013
1.25M
                return p - start + (ctz(u) - 7) / 8;
5014
#else
5015
                // big endian and minor compilers are difficult to test.
5016
                // fallback to per byte check.
5017
                break;
5018
#endif
5019
1.25M
            }
5020
91.0M
            p += SIZEOF_SIZE_T;
5021
91.0M
        }
5022
17.0M
    }
5023
58.0M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5024
64.4M
    assert((end - p) < SIZEOF_SIZE_T);
5025
    // we can not use *(const size_t*)p to avoid buffer overrun.
5026
58.0M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5027
58.0M
    if (u) {
5028
6.35M
        return p - start + (ctz(u) - 7) / 8;
5029
6.35M
    }
5030
51.7M
    return end - start;
5031
#else
5032
    while (p < end) {
5033
        if (*p & 0x80) {
5034
            break;
5035
        }
5036
        p++;
5037
    }
5038
    return p - start;
5039
#endif
5040
58.0M
}
5041
5042
static inline int
5043
scalar_utf8_start_char(unsigned int ch)
5044
934k
{
5045
    // 0xxxxxxx or 11xxxxxx are first byte.
5046
934k
    return (~ch >> 7 | ch >> 6) & 1;
5047
934k
}
5048
5049
static inline size_t
5050
vector_utf8_start_chars(size_t v)
5051
49.9M
{
5052
49.9M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5053
49.9M
}
5054
5055
5056
// Count the number of UTF-8 code points in a given byte sequence.
5057
static Py_ssize_t
5058
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5059
365k
{
5060
365k
    Py_ssize_t len = 0;
5061
5062
365k
    if (end - s >= SIZEOF_SIZE_T) {
5063
295k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5064
16.6k
            len += scalar_utf8_start_char(*s++);
5065
16.6k
        }
5066
5067
745k
        while (s + SIZEOF_SIZE_T <= end) {
5068
465k
            const unsigned char *e = end;
5069
465k
            if (e - s > SIZEOF_SIZE_T * 255) {
5070
188k
                e = s + SIZEOF_SIZE_T * 255;
5071
188k
            }
5072
465k
            Py_ssize_t vstart = 0;
5073
50.4M
            while (s + SIZEOF_SIZE_T <= e) {
5074
49.9M
                size_t v = *(size_t*)s;
5075
49.9M
                size_t vs = vector_utf8_start_chars(v);
5076
49.9M
                vstart += vs;
5077
49.9M
                s += SIZEOF_SIZE_T;
5078
49.9M
            }
5079
465k
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5080
465k
            vstart += vstart >> 16;
5081
465k
#if SIZEOF_SIZE_T == 8
5082
465k
            vstart += vstart >> 32;
5083
465k
#endif
5084
465k
            len += vstart & 0x7ff;
5085
465k
        }
5086
279k
    }
5087
1.28M
    while (s < end) {
5088
917k
        len += scalar_utf8_start_char(*s++);
5089
917k
    }
5090
365k
    return len;
5091
365k
}
5092
5093
static Py_ssize_t
5094
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5095
14.5M
{
5096
14.5M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5097
14.5M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5098
14.4M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5099
11.4M
    {
5100
        /* Fast path, see in STRINGLIB(utf8_decode) for
5101
           an explanation. */
5102
11.4M
        const char *p = start;
5103
11.4M
        Py_UCS1 *q = dest;
5104
16.4M
        while (p + SIZEOF_SIZE_T <= end) {
5105
7.34M
            size_t value = *(const size_t *) p;
5106
7.34M
            if (value & ASCII_CHAR_MASK)
5107
2.38M
                break;
5108
4.96M
            *((size_t *)q) = value;
5109
4.96M
            p += SIZEOF_SIZE_T;
5110
4.96M
            q += SIZEOF_SIZE_T;
5111
4.96M
        }
5112
54.0M
        while (p < end) {
5113
44.9M
            if ((unsigned char)*p & 0x80)
5114
2.41M
                break;
5115
42.5M
            *q++ = *p++;
5116
42.5M
        }
5117
11.4M
        return p - start;
5118
11.4M
    }
5119
3.04M
#endif
5120
3.04M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5121
3.04M
                                         (const unsigned char*)end);
5122
3.04M
    memcpy(dest, start, pos);
5123
3.04M
    return pos;
5124
14.5M
}
5125
5126
static int
5127
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5128
                         const char *starts, const char *s, const char *end,
5129
                         _Py_error_handler error_handler,
5130
                         const char *errors,
5131
                         Py_ssize_t *consumed)
5132
12.7M
{
5133
12.7M
    Py_ssize_t startinpos, endinpos;
5134
12.7M
    const char *errmsg = "";
5135
12.7M
    PyObject *error_handler_obj = NULL;
5136
12.7M
    PyObject *exc = NULL;
5137
5138
329M
    while (s < end) {
5139
325M
        Py_UCS4 ch;
5140
325M
        int kind = writer->kind;
5141
5142
325M
        if (kind == PyUnicode_1BYTE_KIND) {
5143
12.9M
            if (PyUnicode_IS_ASCII(writer->buffer))
5144
12.3M
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5145
538k
            else
5146
538k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5147
312M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5148
114M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5149
197M
        } else {
5150
197M
            assert(kind == PyUnicode_4BYTE_KIND);
5151
197M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5152
197M
        }
5153
5154
325M
        switch (ch) {
5155
8.33M
        case 0:
5156
8.33M
            if (s == end || consumed)
5157
8.30M
                goto End;
5158
25.7k
            errmsg = "unexpected end of data";
5159
25.7k
            startinpos = s - starts;
5160
25.7k
            endinpos = end - starts;
5161
25.7k
            break;
5162
232M
        case 1:
5163
232M
            errmsg = "invalid start byte";
5164
232M
            startinpos = s - starts;
5165
232M
            endinpos = startinpos + 1;
5166
232M
            break;
5167
70.4M
        case 2:
5168
70.4M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5169
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5170
0
            {
5171
                /* Truncated surrogate code in range D800-DFFF */
5172
0
                goto End;
5173
0
            }
5174
70.4M
            _Py_FALLTHROUGH;
5175
71.9M
        case 3:
5176
72.1M
        case 4:
5177
72.1M
            errmsg = "invalid continuation byte";
5178
72.1M
            startinpos = s - starts;
5179
72.1M
            endinpos = startinpos + ch - 1;
5180
72.1M
            break;
5181
12.4M
        default:
5182
            // ch doesn't fit into kind, so change the buffer kind to write
5183
            // the character
5184
12.4M
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5185
0
                goto onError;
5186
12.4M
            continue;
5187
325M
        }
5188
5189
304M
        if (error_handler == _Py_ERROR_UNKNOWN)
5190
173k
            error_handler = _Py_GetErrorHandler(errors);
5191
5192
304M
        switch (error_handler) {
5193
0
        case _Py_ERROR_IGNORE:
5194
0
            s += (endinpos - startinpos);
5195
0
            break;
5196
5197
301M
        case _Py_ERROR_REPLACE:
5198
301M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5199
0
                goto onError;
5200
301M
            s += (endinpos - startinpos);
5201
301M
            break;
5202
5203
2.94M
        case _Py_ERROR_SURROGATEESCAPE:
5204
2.94M
        {
5205
2.94M
            Py_ssize_t i;
5206
5207
2.94M
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5208
0
                goto onError;
5209
5.88M
            for (i=startinpos; i<endinpos; i++) {
5210
2.94M
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5211
2.94M
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5212
2.94M
                                ch + 0xdc00);
5213
2.94M
                writer->pos++;
5214
2.94M
            }
5215
2.94M
            s += (endinpos - startinpos);
5216
2.94M
            break;
5217
2.94M
        }
5218
5219
1.59k
        default:
5220
1.59k
            if (unicode_decode_call_errorhandler_writer(
5221
1.59k
                    errors, &error_handler_obj,
5222
1.59k
                    "utf-8", errmsg,
5223
1.59k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5224
1.59k
                    writer)) {
5225
1.58k
                goto onError;
5226
1.58k
            }
5227
5228
8
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5229
0
                goto onError;
5230
0
            }
5231
304M
        }
5232
304M
    }
5233
5234
12.7M
End:
5235
12.7M
    if (consumed)
5236
931
        *consumed = s - starts;
5237
5238
12.7M
    Py_XDECREF(error_handler_obj);
5239
12.7M
    Py_XDECREF(exc);
5240
12.7M
    return 0;
5241
5242
1.58k
onError:
5243
1.58k
    Py_XDECREF(error_handler_obj);
5244
1.58k
    Py_XDECREF(exc);
5245
1.58k
    return -1;
5246
12.7M
}
5247
5248
5249
static PyObject *
5250
unicode_decode_utf8(const char *s, Py_ssize_t size,
5251
                    _Py_error_handler error_handler, const char *errors,
5252
                    Py_ssize_t *consumed)
5253
102M
{
5254
102M
    if (size == 0) {
5255
2.33M
        if (consumed) {
5256
0
            *consumed = 0;
5257
0
        }
5258
2.33M
        _Py_RETURN_UNICODE_EMPTY();
5259
2.33M
    }
5260
5261
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5262
100M
    if (size == 1 && (unsigned char)s[0] < 128) {
5263
38.9M
        if (consumed) {
5264
0
            *consumed = 1;
5265
0
        }
5266
38.9M
        return get_latin1_char((unsigned char)s[0]);
5267
38.9M
    }
5268
5269
    // I don't know this check is necessary or not. But there is a test
5270
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5271
61.4M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5272
0
        PyErr_NoMemory();
5273
0
        return NULL;
5274
0
    }
5275
5276
61.4M
    const char *starts = s;
5277
61.4M
    const char *end = s + size;
5278
5279
61.4M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5280
61.4M
    if (pos == size) {  // fast path: ASCII string.
5281
48.7M
        PyObject *u = PyUnicode_New(size, 127);
5282
48.7M
        if (u == NULL) {
5283
0
            return NULL;
5284
0
        }
5285
48.7M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5286
48.7M
        if (consumed) {
5287
104
            *consumed = size;
5288
104
        }
5289
48.7M
        return u;
5290
48.7M
    }
5291
5292
12.7M
    int maxchr = 127;
5293
12.7M
    Py_ssize_t maxsize = size;
5294
5295
12.7M
    unsigned char ch = (unsigned char)(s[pos]);
5296
    // error handler other than strict may remove/replace the invalid byte.
5297
    // consumed != NULL allows 1~3 bytes remainings.
5298
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5299
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5300
    // reallocation and copy.
5301
12.7M
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5302
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5303
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5304
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5305
        // means that it is no longer necessary to allocate several times the required amount
5306
        // of memory.
5307
365k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5308
365k
        if (ch < 0xc4) { // latin1
5309
233k
            maxchr = 0xff;
5310
233k
        }
5311
132k
        else if (ch < 0xf0) { // ucs2
5312
119k
            maxchr = 0xffff;
5313
119k
        }
5314
13.3k
        else { // ucs4
5315
13.3k
            maxchr = 0x10ffff;
5316
13.3k
        }
5317
365k
    }
5318
12.7M
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5319
12.7M
    if (!u) {
5320
0
        return NULL;
5321
0
    }
5322
5323
    // Use _PyUnicodeWriter after fast path is failed.
5324
12.7M
    _PyUnicodeWriter writer;
5325
12.7M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5326
12.7M
    if (maxchr <= 255) {
5327
12.5M
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5328
12.5M
        s += pos;
5329
12.5M
        writer.pos = pos;
5330
12.5M
    }
5331
5332
12.7M
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5333
12.7M
                                 error_handler, errors,
5334
12.7M
                                 consumed) < 0) {
5335
1.58k
        _PyUnicodeWriter_Dealloc(&writer);
5336
1.58k
        return NULL;
5337
1.58k
    }
5338
12.7M
    return _PyUnicodeWriter_Finish(&writer);
5339
12.7M
}
5340
5341
5342
// Used by PyUnicodeWriter_WriteUTF8() implementation
5343
int
5344
_PyUnicode_DecodeUTF8Writer(_PyUnicodeWriter *writer,
5345
                            const char *s, Py_ssize_t size,
5346
                            _Py_error_handler error_handler, const char *errors,
5347
                            Py_ssize_t *consumed)
5348
3.14M
{
5349
3.14M
    if (size == 0) {
5350
8.97k
        if (consumed) {
5351
0
            *consumed = 0;
5352
0
        }
5353
8.97k
        return 0;
5354
8.97k
    }
5355
5356
    // fast path: try ASCII string.
5357
3.13M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5358
0
        return -1;
5359
0
    }
5360
5361
3.13M
    const char *starts = s;
5362
3.13M
    const char *end = s + size;
5363
3.13M
    Py_ssize_t decoded = 0;
5364
3.13M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5365
3.13M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5366
3.13M
        decoded = ascii_decode(s, end, dest);
5367
3.13M
        writer->pos += decoded;
5368
5369
3.13M
        if (decoded == size) {
5370
3.08M
            if (consumed) {
5371
923
                *consumed = size;
5372
923
            }
5373
3.08M
            return 0;
5374
3.08M
        }
5375
49.1k
        s += decoded;
5376
49.1k
    }
5377
5378
51.1k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5379
51.1k
                                    error_handler, errors, consumed);
5380
3.13M
}
5381
5382
5383
PyObject *
5384
PyUnicode_DecodeUTF8Stateful(const char *s,
5385
                             Py_ssize_t size,
5386
                             const char *errors,
5387
                             Py_ssize_t *consumed)
5388
102M
{
5389
102M
    return unicode_decode_utf8(s, size,
5390
102M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5391
102M
                               errors, consumed);
5392
102M
}
5393
5394
5395
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5396
   non-zero, use strict error handler otherwise.
5397
5398
   On success, write a pointer to a newly allocated wide character string into
5399
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5400
   (in number of wchar_t units) into *wlen (if wlen is set).
5401
5402
   On memory allocation failure, return -1.
5403
5404
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5405
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5406
   is not NULL, write the decoding error message into *reason. */
5407
int
5408
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5409
                 const char **reason, _Py_error_handler errors)
5410
296
{
5411
296
    const char *orig_s = s;
5412
296
    const char *e;
5413
296
    wchar_t *unicode;
5414
296
    Py_ssize_t outpos;
5415
5416
296
    int surrogateescape = 0;
5417
296
    int surrogatepass = 0;
5418
296
    switch (errors)
5419
296
    {
5420
0
    case _Py_ERROR_STRICT:
5421
0
        break;
5422
296
    case _Py_ERROR_SURROGATEESCAPE:
5423
296
        surrogateescape = 1;
5424
296
        break;
5425
0
    case _Py_ERROR_SURROGATEPASS:
5426
0
        surrogatepass = 1;
5427
0
        break;
5428
0
    default:
5429
0
        return -3;
5430
296
    }
5431
5432
    /* Note: size will always be longer than the resulting Unicode
5433
       character count */
5434
296
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5435
0
        return -1;
5436
0
    }
5437
5438
296
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5439
296
    if (!unicode) {
5440
0
        return -1;
5441
0
    }
5442
5443
    /* Unpack UTF-8 encoded data */
5444
296
    e = s + size;
5445
296
    outpos = 0;
5446
296
    while (s < e) {
5447
296
        Py_UCS4 ch;
5448
296
#if SIZEOF_WCHAR_T == 4
5449
296
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5450
#else
5451
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5452
#endif
5453
296
        if (ch > 0xFF) {
5454
0
#if SIZEOF_WCHAR_T == 4
5455
0
            Py_UNREACHABLE();
5456
#else
5457
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5458
            /* write a surrogate pair */
5459
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5460
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5461
#endif
5462
0
        }
5463
296
        else {
5464
296
            if (!ch && s == e) {
5465
296
                break;
5466
296
            }
5467
5468
0
            if (surrogateescape) {
5469
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5470
0
            }
5471
0
            else {
5472
                /* Is it a valid three-byte code? */
5473
0
                if (surrogatepass
5474
0
                    && (e - s) >= 3
5475
0
                    && (s[0] & 0xf0) == 0xe0
5476
0
                    && (s[1] & 0xc0) == 0x80
5477
0
                    && (s[2] & 0xc0) == 0x80)
5478
0
                {
5479
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5480
0
                    s += 3;
5481
0
                    unicode[outpos++] = ch;
5482
0
                }
5483
0
                else {
5484
0
                    PyMem_RawFree(unicode );
5485
0
                    if (reason != NULL) {
5486
0
                        switch (ch) {
5487
0
                        case 0:
5488
0
                            *reason = "unexpected end of data";
5489
0
                            break;
5490
0
                        case 1:
5491
0
                            *reason = "invalid start byte";
5492
0
                            break;
5493
                        /* 2, 3, 4 */
5494
0
                        default:
5495
0
                            *reason = "invalid continuation byte";
5496
0
                            break;
5497
0
                        }
5498
0
                    }
5499
0
                    if (wlen != NULL) {
5500
0
                        *wlen = s - orig_s;
5501
0
                    }
5502
0
                    return -2;
5503
0
                }
5504
0
            }
5505
0
        }
5506
296
    }
5507
296
    unicode[outpos] = L'\0';
5508
296
    if (wlen) {
5509
296
        *wlen = outpos;
5510
296
    }
5511
296
    *wstr = unicode;
5512
296
    return 0;
5513
296
}
5514
5515
5516
wchar_t*
5517
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5518
                               size_t *wlen)
5519
0
{
5520
0
    wchar_t *wstr;
5521
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5522
0
                               &wstr, wlen,
5523
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5524
0
    if (res != 0) {
5525
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5526
0
        assert(res != -3);
5527
0
        if (wlen) {
5528
0
            *wlen = (size_t)res;
5529
0
        }
5530
0
        return NULL;
5531
0
    }
5532
0
    return wstr;
5533
0
}
5534
5535
5536
/* UTF-8 encoder.
5537
5538
   On success, return 0 and write the newly allocated character string (use
5539
   PyMem_Free() to free the memory) into *str.
5540
5541
   On encoding failure, return -2 and write the position of the invalid
5542
   surrogate character into *error_pos (if error_pos is set) and the decoding
5543
   error message into *reason (if reason is set).
5544
5545
   On memory allocation failure, return -1. */
5546
int
5547
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5548
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5549
629
{
5550
629
    const Py_ssize_t max_char_size = 4;
5551
629
    Py_ssize_t len = wcslen(text);
5552
5553
629
    assert(len >= 0);
5554
5555
629
    int surrogateescape = 0;
5556
629
    int surrogatepass = 0;
5557
629
    switch (errors)
5558
629
    {
5559
148
    case _Py_ERROR_STRICT:
5560
148
        break;
5561
481
    case _Py_ERROR_SURROGATEESCAPE:
5562
481
        surrogateescape = 1;
5563
481
        break;
5564
0
    case _Py_ERROR_SURROGATEPASS:
5565
0
        surrogatepass = 1;
5566
0
        break;
5567
0
    default:
5568
0
        return -3;
5569
629
    }
5570
5571
629
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5572
0
        return -1;
5573
0
    }
5574
629
    char *bytes;
5575
629
    if (raw_malloc) {
5576
629
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5577
629
    }
5578
0
    else {
5579
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5580
0
    }
5581
629
    if (bytes == NULL) {
5582
0
        return -1;
5583
0
    }
5584
5585
629
    char *p = bytes;
5586
629
    Py_ssize_t i;
5587
19.7k
    for (i = 0; i < len; ) {
5588
19.0k
        Py_ssize_t ch_pos = i;
5589
19.0k
        Py_UCS4 ch = text[i];
5590
19.0k
        i++;
5591
19.0k
        if (sizeof(wchar_t) == 2
5592
0
            && Py_UNICODE_IS_HIGH_SURROGATE(ch)
5593
0
            && i < len
5594
0
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5595
0
        {
5596
0
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5597
0
            i++;
5598
0
        }
5599
5600
19.0k
        if (ch < 0x80) {
5601
            /* Encode ASCII */
5602
19.0k
            *p++ = (char) ch;
5603
5604
19.0k
        }
5605
0
        else if (ch < 0x0800) {
5606
            /* Encode Latin-1 */
5607
0
            *p++ = (char)(0xc0 | (ch >> 6));
5608
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5609
0
        }
5610
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5611
            /* surrogateescape error handler */
5612
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5613
0
                if (error_pos != NULL) {
5614
0
                    *error_pos = (size_t)ch_pos;
5615
0
                }
5616
0
                if (reason != NULL) {
5617
0
                    *reason = "encoding error";
5618
0
                }
5619
0
                if (raw_malloc) {
5620
0
                    PyMem_RawFree(bytes);
5621
0
                }
5622
0
                else {
5623
0
                    PyMem_Free(bytes);
5624
0
                }
5625
0
                return -2;
5626
0
            }
5627
0
            *p++ = (char)(ch & 0xff);
5628
0
        }
5629
0
        else if (ch < 0x10000) {
5630
0
            *p++ = (char)(0xe0 | (ch >> 12));
5631
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5632
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5633
0
        }
5634
0
        else {  /* ch >= 0x10000 */
5635
0
            assert(ch <= MAX_UNICODE);
5636
            /* Encode UCS4 Unicode ordinals */
5637
0
            *p++ = (char)(0xf0 | (ch >> 18));
5638
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5639
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5640
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5641
0
        }
5642
19.0k
    }
5643
629
    *p++ = '\0';
5644
5645
629
    size_t final_size = (p - bytes);
5646
629
    char *bytes2;
5647
629
    if (raw_malloc) {
5648
629
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5649
629
    }
5650
0
    else {
5651
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5652
0
    }
5653
629
    if (bytes2 == NULL) {
5654
0
        if (error_pos != NULL) {
5655
0
            *error_pos = (size_t)-1;
5656
0
        }
5657
0
        if (raw_malloc) {
5658
0
            PyMem_RawFree(bytes);
5659
0
        }
5660
0
        else {
5661
0
            PyMem_Free(bytes);
5662
0
        }
5663
0
        return -1;
5664
0
    }
5665
629
    *str = bytes2;
5666
629
    return 0;
5667
629
}
5668
5669
5670
/* Primary internal function which creates utf8 encoded bytes objects.
5671
5672
   Allocation strategy:  if the string is short, convert into a stack buffer
5673
   and allocate exactly as much space needed at the end.  Else allocate the
5674
   maximum possible needed (4 result bytes per Unicode character), and return
5675
   the excess memory at the end.
5676
*/
5677
static PyObject *
5678
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5679
                    const char *errors)
5680
18.7M
{
5681
18.7M
    if (!PyUnicode_Check(unicode)) {
5682
0
        PyErr_BadArgument();
5683
0
        return NULL;
5684
0
    }
5685
5686
18.7M
    if (PyUnicode_UTF8(unicode))
5687
9.83M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5688
9.83M
                                         PyUnicode_UTF8_LENGTH(unicode));
5689
5690
8.90M
    int kind = PyUnicode_KIND(unicode);
5691
8.90M
    const void *data = PyUnicode_DATA(unicode);
5692
8.90M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5693
5694
8.90M
    PyBytesWriter *writer;
5695
8.90M
    char *end;
5696
5697
8.90M
    switch (kind) {
5698
0
    default:
5699
0
        Py_UNREACHABLE();
5700
5.93M
    case PyUnicode_1BYTE_KIND:
5701
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5702
5.93M
        assert(!PyUnicode_IS_ASCII(unicode));
5703
5.93M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5704
5.93M
                                      error_handler, errors, &end);
5705
5.93M
        break;
5706
1.76M
    case PyUnicode_2BYTE_KIND:
5707
1.76M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5708
1.76M
                                      error_handler, errors, &end);
5709
1.76M
        break;
5710
1.21M
    case PyUnicode_4BYTE_KIND:
5711
1.21M
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5712
1.21M
                                      error_handler, errors, &end);
5713
1.21M
        break;
5714
8.90M
    }
5715
5716
8.90M
    if (writer == NULL) {
5717
170k
        PyBytesWriter_Discard(writer);
5718
170k
        return NULL;
5719
170k
    }
5720
8.73M
    return PyBytesWriter_FinishWithPointer(writer, end);
5721
8.90M
}
5722
5723
static int
5724
unicode_fill_utf8(PyObject *unicode)
5725
161k
{
5726
161k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5727
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5728
161k
    assert(!PyUnicode_IS_ASCII(unicode));
5729
5730
161k
    int kind = PyUnicode_KIND(unicode);
5731
161k
    const void *data = PyUnicode_DATA(unicode);
5732
161k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5733
5734
161k
    PyBytesWriter *writer;
5735
161k
    char *end;
5736
5737
161k
    switch (kind) {
5738
0
    default:
5739
0
        Py_UNREACHABLE();
5740
117k
    case PyUnicode_1BYTE_KIND:
5741
117k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5742
117k
                                      _Py_ERROR_STRICT, NULL, &end);
5743
117k
        break;
5744
36.7k
    case PyUnicode_2BYTE_KIND:
5745
36.7k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5746
36.7k
                                      _Py_ERROR_STRICT, NULL, &end);
5747
36.7k
        break;
5748
6.65k
    case PyUnicode_4BYTE_KIND:
5749
6.65k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5750
6.65k
                                      _Py_ERROR_STRICT, NULL, &end);
5751
6.65k
        break;
5752
161k
    }
5753
161k
    if (writer == NULL) {
5754
206
        return -1;
5755
206
    }
5756
5757
160k
    const char *start = PyBytesWriter_GetData(writer);
5758
160k
    Py_ssize_t len = end - start;
5759
5760
160k
    char *cache = PyMem_Malloc(len + 1);
5761
160k
    if (cache == NULL) {
5762
0
        PyBytesWriter_Discard(writer);
5763
0
        PyErr_NoMemory();
5764
0
        return -1;
5765
0
    }
5766
160k
    memcpy(cache, start, len);
5767
160k
    cache[len] = '\0';
5768
160k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5769
160k
    PyUnicode_SET_UTF8(unicode, cache);
5770
160k
    PyBytesWriter_Discard(writer);
5771
160k
    return 0;
5772
160k
}
5773
5774
PyObject *
5775
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5776
17.9M
{
5777
17.9M
    return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5778
17.9M
}
5779
5780
5781
PyObject *
5782
PyUnicode_AsUTF8String(PyObject *unicode)
5783
2.46k
{
5784
2.46k
    return _PyUnicode_AsUTF8String(unicode, NULL);
5785
2.46k
}
5786
5787
/* --- UTF-32 Codec ------------------------------------------------------- */
5788
5789
PyObject *
5790
PyUnicode_DecodeUTF32(const char *s,
5791
                      Py_ssize_t size,
5792
                      const char *errors,
5793
                      int *byteorder)
5794
160
{
5795
160
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5796
160
}
5797
5798
PyObject *
5799
PyUnicode_DecodeUTF32Stateful(const char *s,
5800
                              Py_ssize_t size,
5801
                              const char *errors,
5802
                              int *byteorder,
5803
                              Py_ssize_t *consumed)
5804
51.3k
{
5805
51.3k
    const char *starts = s;
5806
51.3k
    Py_ssize_t startinpos;
5807
51.3k
    Py_ssize_t endinpos;
5808
51.3k
    _PyUnicodeWriter writer;
5809
51.3k
    const unsigned char *q, *e;
5810
51.3k
    int le, bo = 0;       /* assume native ordering by default */
5811
51.3k
    const char *encoding;
5812
51.3k
    const char *errmsg = "";
5813
51.3k
    PyObject *errorHandler = NULL;
5814
51.3k
    PyObject *exc = NULL;
5815
5816
51.3k
    q = (const unsigned char *)s;
5817
51.3k
    e = q + size;
5818
5819
51.3k
    if (byteorder)
5820
51.1k
        bo = *byteorder;
5821
5822
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5823
       byte order setting accordingly. In native mode, the leading BOM
5824
       mark is skipped, in all other modes, it is copied to the output
5825
       stream as-is (giving a ZWNBSP character). */
5826
51.3k
    if (bo == 0 && size >= 4) {
5827
48.8k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5828
48.8k
        if (bom == 0x0000FEFF) {
5829
211
            bo = -1;
5830
211
            q += 4;
5831
211
        }
5832
48.6k
        else if (bom == 0xFFFE0000) {
5833
363
            bo = 1;
5834
363
            q += 4;
5835
363
        }
5836
48.8k
        if (byteorder)
5837
48.6k
            *byteorder = bo;
5838
48.8k
    }
5839
5840
51.3k
    if (q == e) {
5841
122
        if (consumed)
5842
0
            *consumed = size;
5843
122
        _Py_RETURN_UNICODE_EMPTY();
5844
122
    }
5845
5846
#ifdef WORDS_BIGENDIAN
5847
    le = bo < 0;
5848
#else
5849
51.2k
    le = bo <= 0;
5850
51.2k
#endif
5851
51.2k
    encoding = le ? "utf-32-le" : "utf-32-be";
5852
5853
51.2k
    _PyUnicodeWriter_Init(&writer);
5854
51.2k
    writer.min_length = (e - q + 3) / 4;
5855
51.2k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5856
0
        goto onError;
5857
5858
142k
    while (1) {
5859
142k
        Py_UCS4 ch = 0;
5860
142k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5861
5862
142k
        if (e - q >= 4) {
5863
109k
            int kind = writer.kind;
5864
109k
            void *data = writer.data;
5865
109k
            const unsigned char *last = e - 4;
5866
109k
            Py_ssize_t pos = writer.pos;
5867
109k
            if (le) {
5868
3.03M
                do {
5869
3.03M
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5870
3.03M
                    if (ch > maxch)
5871
104k
                        break;
5872
2.92M
                    if (kind != PyUnicode_1BYTE_KIND &&
5873
2.90M
                        Py_UNICODE_IS_SURROGATE(ch))
5874
322
                        break;
5875
2.92M
                    PyUnicode_WRITE(kind, data, pos++, ch);
5876
2.92M
                    q += 4;
5877
2.92M
                } while (q <= last);
5878
105k
            }
5879
3.54k
            else {
5880
6.26k
                do {
5881
6.26k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5882
6.26k
                    if (ch > maxch)
5883
3.21k
                        break;
5884
3.04k
                    if (kind != PyUnicode_1BYTE_KIND &&
5885
2.55k
                        Py_UNICODE_IS_SURROGATE(ch))
5886
113
                        break;
5887
2.93k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5888
2.93k
                    q += 4;
5889
2.93k
                } while (q <= last);
5890
3.54k
            }
5891
109k
            writer.pos = pos;
5892
109k
        }
5893
5894
142k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5895
440
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5896
440
            startinpos = ((const char *)q) - starts;
5897
440
            endinpos = startinpos + 4;
5898
440
        }
5899
141k
        else if (ch <= maxch) {
5900
33.8k
            if (q == e || consumed)
5901
5.35k
                break;
5902
            /* remaining bytes at the end? (size should be divisible by 4) */
5903
28.4k
            errmsg = "truncated data";
5904
28.4k
            startinpos = ((const char *)q) - starts;
5905
28.4k
            endinpos = ((const char *)e) - starts;
5906
28.4k
        }
5907
107k
        else {
5908
107k
            if (ch < 0x110000) {
5909
5.06k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5910
0
                    goto onError;
5911
5.06k
                q += 4;
5912
5.06k
                continue;
5913
5.06k
            }
5914
102k
            errmsg = "code point not in range(0x110000)";
5915
102k
            startinpos = ((const char *)q) - starts;
5916
102k
            endinpos = startinpos + 4;
5917
102k
        }
5918
5919
        /* The remaining input chars are ignored if the callback
5920
           chooses to skip the input */
5921
131k
        if (unicode_decode_call_errorhandler_writer(
5922
131k
                errors, &errorHandler,
5923
131k
                encoding, errmsg,
5924
131k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5925
131k
                &writer))
5926
45.8k
            goto onError;
5927
131k
    }
5928
5929
5.35k
    if (consumed)
5930
0
        *consumed = (const char *)q-starts;
5931
5932
5.35k
    Py_XDECREF(errorHandler);
5933
5.35k
    Py_XDECREF(exc);
5934
5.35k
    return _PyUnicodeWriter_Finish(&writer);
5935
5936
45.8k
  onError:
5937
45.8k
    _PyUnicodeWriter_Dealloc(&writer);
5938
45.8k
    Py_XDECREF(errorHandler);
5939
45.8k
    Py_XDECREF(exc);
5940
45.8k
    return NULL;
5941
51.2k
}
5942
5943
PyObject *
5944
_PyUnicode_EncodeUTF32(PyObject *str,
5945
                       const char *errors,
5946
                       int byteorder)
5947
0
{
5948
0
    if (!PyUnicode_Check(str)) {
5949
0
        PyErr_BadArgument();
5950
0
        return NULL;
5951
0
    }
5952
0
    int kind = PyUnicode_KIND(str);
5953
0
    const void *data = PyUnicode_DATA(str);
5954
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
5955
5956
0
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5957
0
        return PyErr_NoMemory();
5958
0
    Py_ssize_t nsize = len + (byteorder == 0);
5959
5960
0
#if PY_LITTLE_ENDIAN
5961
0
    int native_ordering = byteorder <= 0;
5962
#else
5963
    int native_ordering = byteorder >= 0;
5964
#endif
5965
5966
0
    if (kind == PyUnicode_1BYTE_KIND) {
5967
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
5968
        // on short strings
5969
0
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5970
0
        if (v == NULL) {
5971
0
            return NULL;
5972
0
        }
5973
5974
        /* output buffer is 4-bytes aligned */
5975
0
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5976
0
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
5977
0
        if (byteorder == 0) {
5978
0
            *out++ = 0xFEFF;
5979
0
        }
5980
0
        if (len > 0) {
5981
0
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
5982
0
                                 &out, native_ordering);
5983
0
        }
5984
0
        return v;
5985
0
    }
5986
5987
0
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
5988
0
    if (writer == NULL) {
5989
0
        return NULL;
5990
0
    }
5991
5992
    /* output buffer is 4-bytes aligned */
5993
0
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
5994
0
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
5995
0
    if (byteorder == 0) {
5996
0
        *out++ = 0xFEFF;
5997
0
    }
5998
0
    if (len == 0) {
5999
0
        return PyBytesWriter_Finish(writer);
6000
0
    }
6001
6002
0
    const char *encoding;
6003
0
    if (byteorder == -1)
6004
0
        encoding = "utf-32-le";
6005
0
    else if (byteorder == 1)
6006
0
        encoding = "utf-32-be";
6007
0
    else
6008
0
        encoding = "utf-32";
6009
6010
0
    PyObject *errorHandler = NULL;
6011
0
    PyObject *exc = NULL;
6012
0
    PyObject *rep = NULL;
6013
6014
0
    for (Py_ssize_t pos = 0; pos < len; ) {
6015
0
        if (kind == PyUnicode_2BYTE_KIND) {
6016
0
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
6017
0
                                        &out, native_ordering);
6018
0
        }
6019
0
        else {
6020
0
            assert(kind == PyUnicode_4BYTE_KIND);
6021
0
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
6022
0
                                        &out, native_ordering);
6023
0
        }
6024
0
        if (pos == len)
6025
0
            break;
6026
6027
0
        Py_ssize_t newpos;
6028
0
        rep = unicode_encode_call_errorhandler(
6029
0
                errors, &errorHandler,
6030
0
                encoding, "surrogates not allowed",
6031
0
                str, &exc, pos, pos + 1, &newpos);
6032
0
        if (!rep)
6033
0
            goto error;
6034
6035
0
        Py_ssize_t repsize, moreunits;
6036
0
        if (PyBytes_Check(rep)) {
6037
0
            repsize = PyBytes_GET_SIZE(rep);
6038
0
            if (repsize & 3) {
6039
0
                raise_encode_exception(&exc, encoding,
6040
0
                                       str, pos, pos + 1,
6041
0
                                       "surrogates not allowed");
6042
0
                goto error;
6043
0
            }
6044
0
            moreunits = repsize / 4;
6045
0
        }
6046
0
        else {
6047
0
            assert(PyUnicode_Check(rep));
6048
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6049
0
            if (!PyUnicode_IS_ASCII(rep)) {
6050
0
                raise_encode_exception(&exc, encoding,
6051
0
                                       str, pos, pos + 1,
6052
0
                                       "surrogates not allowed");
6053
0
                goto error;
6054
0
            }
6055
0
        }
6056
0
        moreunits += pos - newpos;
6057
0
        pos = newpos;
6058
6059
        /* four bytes are reserved for each surrogate */
6060
0
        if (moreunits > 0) {
6061
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
6062
0
            if (out == NULL) {
6063
0
                goto error;
6064
0
            }
6065
0
        }
6066
6067
0
        if (PyBytes_Check(rep)) {
6068
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6069
0
            out += repsize / 4;
6070
0
        }
6071
0
        else {
6072
            /* rep is unicode */
6073
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6074
0
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6075
0
                                 &out, native_ordering);
6076
0
        }
6077
6078
0
        Py_CLEAR(rep);
6079
0
    }
6080
6081
0
    Py_XDECREF(errorHandler);
6082
0
    Py_XDECREF(exc);
6083
6084
    /* Cut back to size actually needed. This is necessary for, for example,
6085
       encoding of a string containing isolated surrogates and the 'ignore'
6086
       handler is used. */
6087
0
    return PyBytesWriter_FinishWithPointer(writer, out);
6088
6089
0
  error:
6090
0
    Py_XDECREF(rep);
6091
0
    Py_XDECREF(errorHandler);
6092
0
    Py_XDECREF(exc);
6093
0
    PyBytesWriter_Discard(writer);
6094
0
    return NULL;
6095
0
}
6096
6097
PyObject *
6098
PyUnicode_AsUTF32String(PyObject *unicode)
6099
0
{
6100
0
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6101
0
}
6102
6103
/* --- UTF-16 Codec ------------------------------------------------------- */
6104
6105
PyObject *
6106
PyUnicode_DecodeUTF16(const char *s,
6107
                      Py_ssize_t size,
6108
                      const char *errors,
6109
                      int *byteorder)
6110
183
{
6111
183
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6112
183
}
6113
6114
PyObject *
6115
PyUnicode_DecodeUTF16Stateful(const char *s,
6116
                              Py_ssize_t size,
6117
                              const char *errors,
6118
                              int *byteorder,
6119
                              Py_ssize_t *consumed)
6120
17.2k
{
6121
17.2k
    const char *starts = s;
6122
17.2k
    Py_ssize_t startinpos;
6123
17.2k
    Py_ssize_t endinpos;
6124
17.2k
    _PyUnicodeWriter writer;
6125
17.2k
    const unsigned char *q, *e;
6126
17.2k
    int bo = 0;       /* assume native ordering by default */
6127
17.2k
    int native_ordering;
6128
17.2k
    const char *errmsg = "";
6129
17.2k
    PyObject *errorHandler = NULL;
6130
17.2k
    PyObject *exc = NULL;
6131
17.2k
    const char *encoding;
6132
6133
17.2k
    q = (const unsigned char *)s;
6134
17.2k
    e = q + size;
6135
6136
17.2k
    if (byteorder)
6137
17.0k
        bo = *byteorder;
6138
6139
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6140
       byte order setting accordingly. In native mode, the leading BOM
6141
       mark is skipped, in all other modes, it is copied to the output
6142
       stream as-is (giving a ZWNBSP character). */
6143
17.2k
    if (bo == 0 && size >= 2) {
6144
16.3k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6145
16.3k
        if (bom == 0xFEFF) {
6146
276
            q += 2;
6147
276
            bo = -1;
6148
276
        }
6149
16.1k
        else if (bom == 0xFFFE) {
6150
2.21k
            q += 2;
6151
2.21k
            bo = 1;
6152
2.21k
        }
6153
16.3k
        if (byteorder)
6154
16.1k
            *byteorder = bo;
6155
16.3k
    }
6156
6157
17.2k
    if (q == e) {
6158
56
        if (consumed)
6159
0
            *consumed = size;
6160
56
        _Py_RETURN_UNICODE_EMPTY();
6161
56
    }
6162
6163
17.1k
#if PY_LITTLE_ENDIAN
6164
17.1k
    native_ordering = bo <= 0;
6165
17.1k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6166
#else
6167
    native_ordering = bo >= 0;
6168
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6169
#endif
6170
6171
    /* Note: size will always be longer than the resulting Unicode
6172
       character count normally.  Error handler will take care of
6173
       resizing when needed. */
6174
17.1k
    _PyUnicodeWriter_Init(&writer);
6175
17.1k
    writer.min_length = (e - q + 1) / 2;
6176
17.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6177
0
        goto onError;
6178
6179
61.2k
    while (1) {
6180
61.2k
        Py_UCS4 ch = 0;
6181
61.2k
        if (e - q >= 2) {
6182
52.3k
            int kind = writer.kind;
6183
52.3k
            if (kind == PyUnicode_1BYTE_KIND) {
6184
20.6k
                if (PyUnicode_IS_ASCII(writer.buffer))
6185
16.5k
                    ch = asciilib_utf16_decode(&q, e,
6186
16.5k
                            (Py_UCS1*)writer.data, &writer.pos,
6187
16.5k
                            native_ordering);
6188
4.10k
                else
6189
4.10k
                    ch = ucs1lib_utf16_decode(&q, e,
6190
4.10k
                            (Py_UCS1*)writer.data, &writer.pos,
6191
4.10k
                            native_ordering);
6192
31.6k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6193
17.0k
                ch = ucs2lib_utf16_decode(&q, e,
6194
17.0k
                        (Py_UCS2*)writer.data, &writer.pos,
6195
17.0k
                        native_ordering);
6196
17.0k
            } else {
6197
14.6k
                assert(kind == PyUnicode_4BYTE_KIND);
6198
14.6k
                ch = ucs4lib_utf16_decode(&q, e,
6199
14.6k
                        (Py_UCS4*)writer.data, &writer.pos,
6200
14.6k
                        native_ordering);
6201
14.6k
            }
6202
52.3k
        }
6203
6204
61.2k
        switch (ch)
6205
61.2k
        {
6206
18.1k
        case 0:
6207
            /* remaining byte at the end? (size should be even) */
6208
18.1k
            if (q == e || consumed)
6209
11.4k
                goto End;
6210
6.70k
            errmsg = "truncated data";
6211
6.70k
            startinpos = ((const char *)q) - starts;
6212
6.70k
            endinpos = ((const char *)e) - starts;
6213
6.70k
            break;
6214
            /* The remaining input chars are ignored if the callback
6215
               chooses to skip the input */
6216
1.73k
        case 1:
6217
1.73k
            q -= 2;
6218
1.73k
            if (consumed)
6219
0
                goto End;
6220
1.73k
            errmsg = "unexpected end of data";
6221
1.73k
            startinpos = ((const char *)q) - starts;
6222
1.73k
            endinpos = ((const char *)e) - starts;
6223
1.73k
            break;
6224
13.4k
        case 2:
6225
13.4k
            errmsg = "illegal encoding";
6226
13.4k
            startinpos = ((const char *)q) - 2 - starts;
6227
13.4k
            endinpos = startinpos + 2;
6228
13.4k
            break;
6229
8.62k
        case 3:
6230
8.62k
            errmsg = "illegal UTF-16 surrogate";
6231
8.62k
            startinpos = ((const char *)q) - 4 - starts;
6232
8.62k
            endinpos = startinpos + 2;
6233
8.62k
            break;
6234
19.3k
        default:
6235
19.3k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6236
0
                goto onError;
6237
19.3k
            continue;
6238
61.2k
        }
6239
6240
30.5k
        if (unicode_decode_call_errorhandler_writer(
6241
30.5k
                errors,
6242
30.5k
                &errorHandler,
6243
30.5k
                encoding, errmsg,
6244
30.5k
                &starts,
6245
30.5k
                (const char **)&e,
6246
30.5k
                &startinpos,
6247
30.5k
                &endinpos,
6248
30.5k
                &exc,
6249
30.5k
                (const char **)&q,
6250
30.5k
                &writer))
6251
5.69k
            goto onError;
6252
30.5k
    }
6253
6254
11.4k
End:
6255
11.4k
    if (consumed)
6256
0
        *consumed = (const char *)q-starts;
6257
6258
11.4k
    Py_XDECREF(errorHandler);
6259
11.4k
    Py_XDECREF(exc);
6260
11.4k
    return _PyUnicodeWriter_Finish(&writer);
6261
6262
5.69k
  onError:
6263
5.69k
    _PyUnicodeWriter_Dealloc(&writer);
6264
5.69k
    Py_XDECREF(errorHandler);
6265
5.69k
    Py_XDECREF(exc);
6266
5.69k
    return NULL;
6267
17.1k
}
6268
6269
PyObject *
6270
_PyUnicode_EncodeUTF16(PyObject *str,
6271
                       const char *errors,
6272
                       int byteorder)
6273
7.12k
{
6274
7.12k
    if (!PyUnicode_Check(str)) {
6275
0
        PyErr_BadArgument();
6276
0
        return NULL;
6277
0
    }
6278
7.12k
    int kind = PyUnicode_KIND(str);
6279
7.12k
    const void *data = PyUnicode_DATA(str);
6280
7.12k
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
6281
6282
7.12k
    Py_ssize_t pairs = 0;
6283
7.12k
    if (kind == PyUnicode_4BYTE_KIND) {
6284
0
        const Py_UCS4 *in = (const Py_UCS4 *)data;
6285
0
        const Py_UCS4 *end = in + len;
6286
0
        while (in < end) {
6287
0
            if (*in++ >= 0x10000) {
6288
0
                pairs++;
6289
0
            }
6290
0
        }
6291
0
    }
6292
7.12k
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6293
0
        return PyErr_NoMemory();
6294
0
    }
6295
7.12k
    Py_ssize_t nsize = len + pairs + (byteorder == 0);
6296
6297
#if PY_BIG_ENDIAN
6298
    int native_ordering = byteorder >= 0;
6299
#else
6300
7.12k
    int native_ordering = byteorder <= 0;
6301
7.12k
#endif
6302
6303
7.12k
    if (kind == PyUnicode_1BYTE_KIND) {
6304
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
6305
        // on short strings
6306
7.06k
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6307
7.06k
        if (v == NULL) {
6308
0
            return NULL;
6309
0
        }
6310
6311
        /* output buffer is 2-bytes aligned */
6312
7.06k
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6313
7.06k
        unsigned short *out = (unsigned short *)PyBytes_AS_STRING(v);
6314
7.06k
        if (byteorder == 0) {
6315
0
            *out++ = 0xFEFF;
6316
0
        }
6317
7.06k
        if (len > 0) {
6318
7.06k
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6319
7.06k
        }
6320
7.06k
        return v;
6321
7.06k
    }
6322
6323
67
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
6324
67
    if (writer == NULL) {
6325
0
        return NULL;
6326
0
    }
6327
6328
    /* output buffer is 2-bytes aligned */
6329
67
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
6330
67
    unsigned short *out = PyBytesWriter_GetData(writer);
6331
67
    if (byteorder == 0) {
6332
0
        *out++ = 0xFEFF;
6333
0
    }
6334
67
    if (len == 0) {
6335
0
        return PyBytesWriter_Finish(writer);
6336
0
    }
6337
6338
67
    const char *encoding;
6339
67
    if (byteorder < 0) {
6340
0
        encoding = "utf-16-le";
6341
0
    }
6342
67
    else if (byteorder > 0) {
6343
67
        encoding = "utf-16-be";
6344
67
    }
6345
0
    else {
6346
0
        encoding = "utf-16";
6347
0
    }
6348
6349
67
    PyObject *errorHandler = NULL;
6350
67
    PyObject *exc = NULL;
6351
67
    PyObject *rep = NULL;
6352
6353
67
    for (Py_ssize_t pos = 0; pos < len; ) {
6354
67
        if (kind == PyUnicode_2BYTE_KIND) {
6355
67
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6356
67
                                        &out, native_ordering);
6357
67
        }
6358
0
        else {
6359
0
            assert(kind == PyUnicode_4BYTE_KIND);
6360
0
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6361
0
                                        &out, native_ordering);
6362
0
        }
6363
67
        if (pos == len)
6364
67
            break;
6365
6366
0
        Py_ssize_t newpos;
6367
0
        rep = unicode_encode_call_errorhandler(
6368
0
                errors, &errorHandler,
6369
0
                encoding, "surrogates not allowed",
6370
0
                str, &exc, pos, pos + 1, &newpos);
6371
0
        if (!rep)
6372
0
            goto error;
6373
6374
0
        Py_ssize_t repsize, moreunits;
6375
0
        if (PyBytes_Check(rep)) {
6376
0
            repsize = PyBytes_GET_SIZE(rep);
6377
0
            if (repsize & 1) {
6378
0
                raise_encode_exception(&exc, encoding,
6379
0
                                       str, pos, pos + 1,
6380
0
                                       "surrogates not allowed");
6381
0
                goto error;
6382
0
            }
6383
0
            moreunits = repsize / 2;
6384
0
        }
6385
0
        else {
6386
0
            assert(PyUnicode_Check(rep));
6387
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6388
0
            if (!PyUnicode_IS_ASCII(rep)) {
6389
0
                raise_encode_exception(&exc, encoding,
6390
0
                                       str, pos, pos + 1,
6391
0
                                       "surrogates not allowed");
6392
0
                goto error;
6393
0
            }
6394
0
        }
6395
0
        moreunits += pos - newpos;
6396
0
        pos = newpos;
6397
6398
        /* two bytes are reserved for each surrogate */
6399
0
        if (moreunits > 0) {
6400
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
6401
0
            if (out == NULL) {
6402
0
                goto error;
6403
0
            }
6404
0
        }
6405
6406
0
        if (PyBytes_Check(rep)) {
6407
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6408
0
            out += repsize / 2;
6409
0
        } else {
6410
            /* rep is unicode */
6411
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6412
0
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6413
0
                                 &out, native_ordering);
6414
0
        }
6415
6416
0
        Py_CLEAR(rep);
6417
0
    }
6418
6419
67
    Py_XDECREF(errorHandler);
6420
67
    Py_XDECREF(exc);
6421
6422
    /* Cut back to size actually needed. This is necessary for, for example,
6423
    encoding of a string containing isolated surrogates and the 'ignore' handler
6424
    is used. */
6425
67
    return PyBytesWriter_FinishWithPointer(writer, out);
6426
6427
0
  error:
6428
0
    Py_XDECREF(rep);
6429
0
    Py_XDECREF(errorHandler);
6430
0
    Py_XDECREF(exc);
6431
0
    PyBytesWriter_Discard(writer);
6432
0
    return NULL;
6433
67
}
6434
6435
PyObject *
6436
PyUnicode_AsUTF16String(PyObject *unicode)
6437
0
{
6438
0
    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6439
0
}
6440
6441
_PyUnicode_Name_CAPI *
6442
_PyUnicode_GetNameCAPI(void)
6443
16.1k
{
6444
16.1k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6445
16.1k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6446
6447
16.1k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6448
16.1k
    if (ucnhash_capi == NULL) {
6449
2
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6450
2
                PyUnicodeData_CAPSULE_NAME, 1);
6451
6452
        // It's fine if we overwrite the value here. It's always the same value.
6453
2
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6454
2
    }
6455
16.1k
    return ucnhash_capi;
6456
16.1k
}
6457
6458
/* --- Unicode Escape Codec ----------------------------------------------- */
6459
6460
PyObject *
6461
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6462
                               Py_ssize_t size,
6463
                               const char *errors,
6464
                               Py_ssize_t *consumed,
6465
                               int *first_invalid_escape_char,
6466
                               const char **first_invalid_escape_ptr)
6467
30.2k
{
6468
30.2k
    const char *starts = s;
6469
30.2k
    const char *initial_starts = starts;
6470
30.2k
    _PyUnicodeWriter writer;
6471
30.2k
    const char *end;
6472
30.2k
    PyObject *errorHandler = NULL;
6473
30.2k
    PyObject *exc = NULL;
6474
30.2k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6475
6476
    // so we can remember if we've seen an invalid escape char or not
6477
30.2k
    *first_invalid_escape_char = -1;
6478
30.2k
    *first_invalid_escape_ptr = NULL;
6479
6480
30.2k
    if (size == 0) {
6481
2.89k
        if (consumed) {
6482
0
            *consumed = 0;
6483
0
        }
6484
2.89k
        _Py_RETURN_UNICODE_EMPTY();
6485
2.89k
    }
6486
    /* Escaped strings will always be longer than the resulting
6487
       Unicode string, so we start with size here and then reduce the
6488
       length after conversion to the true value.
6489
       (but if the error callback returns a long replacement string
6490
       we'll have to allocate more space) */
6491
27.3k
    _PyUnicodeWriter_Init(&writer);
6492
27.3k
    writer.min_length = size;
6493
27.3k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6494
0
        goto onError;
6495
0
    }
6496
6497
27.3k
    end = s + size;
6498
8.94M
    while (s < end) {
6499
8.92M
        unsigned char c = (unsigned char) *s++;
6500
8.92M
        Py_UCS4 ch;
6501
8.92M
        int count;
6502
8.92M
        const char *message;
6503
6504
8.92M
#define WRITE_ASCII_CHAR(ch)                                                  \
6505
8.92M
            do {                                                              \
6506
115k
                assert(ch <= 127);                                            \
6507
115k
                assert(writer.pos < writer.size);                             \
6508
115k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6509
115k
            } while(0)
6510
6511
8.92M
#define WRITE_CHAR(ch)                                                        \
6512
8.92M
            do {                                                              \
6513
8.83M
                if (ch <= writer.maxchar) {                                   \
6514
8.82M
                    assert(writer.pos < writer.size);                         \
6515
8.82M
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6516
8.82M
                }                                                             \
6517
8.83M
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6518
0
                    goto onError;                                             \
6519
0
                }                                                             \
6520
8.83M
            } while(0)
6521
6522
        /* Non-escape characters are interpreted as Unicode ordinals */
6523
8.92M
        if (c != '\\') {
6524
8.63M
            WRITE_CHAR(c);
6525
8.63M
            continue;
6526
8.63M
        }
6527
6528
282k
        Py_ssize_t startinpos = s - starts - 1;
6529
        /* \ - Escapes */
6530
282k
        if (s >= end) {
6531
15
            message = "\\ at end of string";
6532
15
            goto incomplete;
6533
15
        }
6534
282k
        c = (unsigned char) *s++;
6535
6536
282k
        assert(writer.pos < writer.size);
6537
282k
        switch (c) {
6538
6539
            /* \x escapes */
6540
1.45k
        case '\n': continue;
6541
37.3k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6542
2.09k
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6543
3.50k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6544
3.05k
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6545
        /* FF */
6546
6.76k
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6547
1.48k
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6548
2.21k
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6549
6.47k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6550
        /* VT */
6551
14.1k
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6552
        /* BEL, not classic C */
6553
2.60k
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6554
6555
            /* \OOO (octal) escapes */
6556
42.5k
        case '0': case '1': case '2': case '3':
6557
109k
        case '4': case '5': case '6': case '7':
6558
109k
            ch = c - '0';
6559
109k
            if (s < end && '0' <= *s && *s <= '7') {
6560
64.6k
                ch = (ch<<3) + *s++ - '0';
6561
64.6k
                if (s < end && '0' <= *s && *s <= '7') {
6562
50.7k
                    ch = (ch<<3) + *s++ - '0';
6563
50.7k
                }
6564
64.6k
            }
6565
109k
            if (ch > 0377) {
6566
48.6k
                if (*first_invalid_escape_char == -1) {
6567
861
                    *first_invalid_escape_char = ch;
6568
861
                    if (starts == initial_starts) {
6569
                        /* Back up 3 chars, since we've already incremented s. */
6570
861
                        *first_invalid_escape_ptr = s - 3;
6571
861
                    }
6572
861
                }
6573
48.6k
            }
6574
109k
            WRITE_CHAR(ch);
6575
109k
            continue;
6576
6577
            /* hex escapes */
6578
            /* \xXX */
6579
109k
        case 'x':
6580
14.6k
            count = 2;
6581
14.6k
            message = "truncated \\xXX escape";
6582
14.6k
            goto hexescape;
6583
6584
            /* \uXXXX */
6585
6.63k
        case 'u':
6586
6.63k
            count = 4;
6587
6.63k
            message = "truncated \\uXXXX escape";
6588
6.63k
            goto hexescape;
6589
6590
            /* \UXXXXXXXX */
6591
17.9k
        case 'U':
6592
17.9k
            count = 8;
6593
17.9k
            message = "truncated \\UXXXXXXXX escape";
6594
39.2k
        hexescape:
6595
238k
            for (ch = 0; count; ++s, --count) {
6596
199k
                if (s >= end) {
6597
17
                    goto incomplete;
6598
17
                }
6599
199k
                c = (unsigned char)*s;
6600
199k
                ch <<= 4;
6601
199k
                if (c >= '0' && c <= '9') {
6602
140k
                    ch += c - '0';
6603
140k
                }
6604
58.9k
                else if (c >= 'a' && c <= 'f') {
6605
58.4k
                    ch += c - ('a' - 10);
6606
58.4k
                }
6607
456
                else if (c >= 'A' && c <= 'F') {
6608
421
                    ch += c - ('A' - 10);
6609
421
                }
6610
35
                else {
6611
35
                    goto error;
6612
35
                }
6613
199k
            }
6614
6615
            /* when we get here, ch is a 32-bit unicode character */
6616
39.2k
            if (ch > MAX_UNICODE) {
6617
8
                message = "illegal Unicode character";
6618
8
                goto error;
6619
8
            }
6620
6621
39.2k
            WRITE_CHAR(ch);
6622
39.2k
            continue;
6623
6624
            /* \N{name} */
6625
39.2k
        case 'N':
6626
16.1k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6627
16.1k
            if (ucnhash_capi == NULL) {
6628
0
                PyErr_SetString(
6629
0
                        PyExc_UnicodeError,
6630
0
                        "\\N escapes not supported (can't load unicodedata module)"
6631
0
                );
6632
0
                goto onError;
6633
0
            }
6634
6635
16.1k
            message = "malformed \\N character escape";
6636
16.1k
            if (s >= end) {
6637
7
                goto incomplete;
6638
7
            }
6639
16.1k
            if (*s == '{') {
6640
16.1k
                const char *start = ++s;
6641
16.1k
                size_t namelen;
6642
                /* look for the closing brace */
6643
84.4k
                while (s < end && *s != '}')
6644
68.3k
                    s++;
6645
16.1k
                if (s >= end) {
6646
25
                    goto incomplete;
6647
25
                }
6648
16.1k
                namelen = s - start;
6649
16.1k
                if (namelen) {
6650
                    /* found a name.  look it up in the unicode database */
6651
16.1k
                    s++;
6652
16.1k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6653
16.1k
                    if (namelen <= INT_MAX &&
6654
16.1k
                        ucnhash_capi->getcode(start, (int)namelen,
6655
16.1k
                                              &ch, 0)) {
6656
15.9k
                        assert(ch <= MAX_UNICODE);
6657
15.9k
                        WRITE_CHAR(ch);
6658
15.9k
                        continue;
6659
15.9k
                    }
6660
110
                    message = "unknown Unicode character name";
6661
110
                }
6662
16.1k
            }
6663
127
            goto error;
6664
6665
36.3k
        default:
6666
36.3k
            if (*first_invalid_escape_char == -1) {
6667
4.32k
                *first_invalid_escape_char = c;
6668
4.32k
                if (starts == initial_starts) {
6669
                    /* Back up one char, since we've already incremented s. */
6670
4.32k
                    *first_invalid_escape_ptr = s - 1;
6671
4.32k
                }
6672
4.32k
            }
6673
36.3k
            WRITE_ASCII_CHAR('\\');
6674
36.3k
            WRITE_CHAR(c);
6675
36.3k
            continue;
6676
282k
        }
6677
6678
64
      incomplete:
6679
64
        if (consumed) {
6680
0
            *consumed = startinpos;
6681
0
            break;
6682
0
        }
6683
234
      error:;
6684
234
        Py_ssize_t endinpos = s-starts;
6685
234
        writer.min_length = end - s + writer.pos;
6686
234
        if (unicode_decode_call_errorhandler_writer(
6687
234
                errors, &errorHandler,
6688
234
                "unicodeescape", message,
6689
234
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6690
234
                &writer)) {
6691
234
            goto onError;
6692
234
        }
6693
234
        assert(end - s <= writer.size - writer.pos);
6694
6695
0
#undef WRITE_ASCII_CHAR
6696
0
#undef WRITE_CHAR
6697
0
    }
6698
6699
27.0k
    Py_XDECREF(errorHandler);
6700
27.0k
    Py_XDECREF(exc);
6701
27.0k
    return _PyUnicodeWriter_Finish(&writer);
6702
6703
234
  onError:
6704
234
    _PyUnicodeWriter_Dealloc(&writer);
6705
234
    Py_XDECREF(errorHandler);
6706
234
    Py_XDECREF(exc);
6707
234
    return NULL;
6708
27.3k
}
6709
6710
PyObject *
6711
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6712
                              Py_ssize_t size,
6713
                              const char *errors,
6714
                              Py_ssize_t *consumed)
6715
545
{
6716
545
    int first_invalid_escape_char;
6717
545
    const char *first_invalid_escape_ptr;
6718
545
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6719
545
                                                      consumed,
6720
545
                                                      &first_invalid_escape_char,
6721
545
                                                      &first_invalid_escape_ptr);
6722
545
    if (result == NULL)
6723
122
        return NULL;
6724
423
    if (first_invalid_escape_char != -1) {
6725
303
        if (first_invalid_escape_char > 0xff) {
6726
96
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6727
96
                                 "\"\\%o\" is an invalid octal escape sequence. "
6728
96
                                 "Such sequences will not work in the future. ",
6729
96
                                 first_invalid_escape_char) < 0)
6730
0
            {
6731
0
                Py_DECREF(result);
6732
0
                return NULL;
6733
0
            }
6734
96
        }
6735
207
        else {
6736
207
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6737
207
                                 "\"\\%c\" is an invalid escape sequence. "
6738
207
                                 "Such sequences will not work in the future. ",
6739
207
                                 first_invalid_escape_char) < 0)
6740
0
            {
6741
0
                Py_DECREF(result);
6742
0
                return NULL;
6743
0
            }
6744
207
        }
6745
303
    }
6746
423
    return result;
6747
423
}
6748
6749
PyObject *
6750
PyUnicode_DecodeUnicodeEscape(const char *s,
6751
                              Py_ssize_t size,
6752
                              const char *errors)
6753
0
{
6754
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6755
0
}
6756
6757
/* Return a Unicode-Escape string version of the Unicode object. */
6758
6759
PyObject *
6760
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6761
330k
{
6762
330k
    if (!PyUnicode_Check(unicode)) {
6763
0
        PyErr_BadArgument();
6764
0
        return NULL;
6765
0
    }
6766
6767
330k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6768
330k
    if (len == 0) {
6769
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6770
0
    }
6771
330k
    int kind = PyUnicode_KIND(unicode);
6772
330k
    const void *data = PyUnicode_DATA(unicode);
6773
6774
    /* Initial allocation is based on the longest-possible character
6775
     * escape.
6776
     *
6777
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6778
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6779
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6780
330k
    Py_ssize_t expandsize = kind * 2 + 2;
6781
330k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6782
0
        return PyErr_NoMemory();
6783
0
    }
6784
6785
330k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6786
330k
    if (writer == NULL) {
6787
0
        return NULL;
6788
0
    }
6789
330k
    char *p = PyBytesWriter_GetData(writer);
6790
6791
661k
    for (Py_ssize_t i = 0; i < len; i++) {
6792
330k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6793
6794
        /* U+0000-U+00ff range */
6795
330k
        if (ch < 0x100) {
6796
324k
            if (ch >= ' ' && ch < 127) {
6797
27.6k
                if (ch != '\\') {
6798
                    /* Copy printable US ASCII as-is */
6799
0
                    *p++ = (char) ch;
6800
0
                }
6801
                /* Escape backslashes */
6802
27.6k
                else {
6803
27.6k
                    *p++ = '\\';
6804
27.6k
                    *p++ = '\\';
6805
27.6k
                }
6806
27.6k
            }
6807
6808
            /* Map special whitespace to '\t', \n', '\r' */
6809
297k
            else if (ch == '\t') {
6810
4.05k
                *p++ = '\\';
6811
4.05k
                *p++ = 't';
6812
4.05k
            }
6813
293k
            else if (ch == '\n') {
6814
2.26k
                *p++ = '\\';
6815
2.26k
                *p++ = 'n';
6816
2.26k
            }
6817
290k
            else if (ch == '\r') {
6818
650
                *p++ = '\\';
6819
650
                *p++ = 'r';
6820
650
            }
6821
6822
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6823
290k
            else {
6824
290k
                *p++ = '\\';
6825
290k
                *p++ = 'x';
6826
290k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6827
290k
                *p++ = Py_hexdigits[ch & 0x000F];
6828
290k
            }
6829
324k
        }
6830
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6831
5.92k
        else if (ch < 0x10000) {
6832
4.71k
            *p++ = '\\';
6833
4.71k
            *p++ = 'u';
6834
4.71k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6835
4.71k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6836
4.71k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6837
4.71k
            *p++ = Py_hexdigits[ch & 0x000F];
6838
4.71k
        }
6839
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6840
1.20k
        else {
6841
6842
            /* Make sure that the first two digits are zero */
6843
1.20k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6844
1.20k
            *p++ = '\\';
6845
1.20k
            *p++ = 'U';
6846
1.20k
            *p++ = '0';
6847
1.20k
            *p++ = '0';
6848
1.20k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6849
1.20k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6850
1.20k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6851
1.20k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6852
1.20k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6853
1.20k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6854
1.20k
        }
6855
330k
    }
6856
6857
330k
    return PyBytesWriter_FinishWithPointer(writer, p);
6858
330k
}
6859
6860
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6861
6862
PyObject *
6863
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6864
                                          Py_ssize_t size,
6865
                                          const char *errors,
6866
                                          Py_ssize_t *consumed)
6867
112
{
6868
112
    const char *starts = s;
6869
112
    _PyUnicodeWriter writer;
6870
112
    const char *end;
6871
112
    PyObject *errorHandler = NULL;
6872
112
    PyObject *exc = NULL;
6873
6874
112
    if (size == 0) {
6875
0
        if (consumed) {
6876
0
            *consumed = 0;
6877
0
        }
6878
0
        _Py_RETURN_UNICODE_EMPTY();
6879
0
    }
6880
6881
    /* Escaped strings will always be longer than the resulting
6882
       Unicode string, so we start with size here and then reduce the
6883
       length after conversion to the true value. (But decoding error
6884
       handler might have to resize the string) */
6885
112
    _PyUnicodeWriter_Init(&writer);
6886
112
    writer.min_length = size;
6887
112
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6888
0
        goto onError;
6889
0
    }
6890
6891
112
    end = s + size;
6892
46.7k
    while (s < end) {
6893
46.6k
        unsigned char c = (unsigned char) *s++;
6894
46.6k
        Py_UCS4 ch;
6895
46.6k
        int count;
6896
46.6k
        const char *message;
6897
6898
46.6k
#define WRITE_CHAR(ch)                                                        \
6899
46.6k
            do {                                                              \
6900
46.6k
                if (ch <= writer.maxchar) {                                   \
6901
46.5k
                    assert(writer.pos < writer.size);                         \
6902
46.5k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6903
46.5k
                }                                                             \
6904
46.6k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6905
0
                    goto onError;                                             \
6906
0
                }                                                             \
6907
46.6k
            } while(0)
6908
6909
        /* Non-escape characters are interpreted as Unicode ordinals */
6910
46.6k
        if (c != '\\' || (s >= end && !consumed)) {
6911
43.5k
            WRITE_CHAR(c);
6912
43.5k
            continue;
6913
43.5k
        }
6914
6915
3.07k
        Py_ssize_t startinpos = s - starts - 1;
6916
        /* \ - Escapes */
6917
3.07k
        if (s >= end) {
6918
0
            assert(consumed);
6919
            // Set message to silent compiler warning.
6920
            // Actually it is never used.
6921
0
            message = "\\ at end of string";
6922
0
            goto incomplete;
6923
0
        }
6924
6925
3.07k
        c = (unsigned char) *s++;
6926
3.07k
        if (c == 'u') {
6927
404
            count = 4;
6928
404
            message = "truncated \\uXXXX escape";
6929
404
        }
6930
2.66k
        else if (c == 'U') {
6931
542
            count = 8;
6932
542
            message = "truncated \\UXXXXXXXX escape";
6933
542
        }
6934
2.12k
        else {
6935
2.12k
            assert(writer.pos < writer.size);
6936
2.12k
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6937
2.12k
            WRITE_CHAR(c);
6938
2.12k
            continue;
6939
2.12k
        }
6940
6941
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6942
6.78k
        for (ch = 0; count; ++s, --count) {
6943
5.86k
            if (s >= end) {
6944
7
                goto incomplete;
6945
7
            }
6946
5.85k
            c = (unsigned char)*s;
6947
5.85k
            ch <<= 4;
6948
5.85k
            if (c >= '0' && c <= '9') {
6949
5.00k
                ch += c - '0';
6950
5.00k
            }
6951
850
            else if (c >= 'a' && c <= 'f') {
6952
736
                ch += c - ('a' - 10);
6953
736
            }
6954
114
            else if (c >= 'A' && c <= 'F') {
6955
95
                ch += c - ('A' - 10);
6956
95
            }
6957
19
            else {
6958
19
                goto error;
6959
19
            }
6960
5.85k
        }
6961
920
        if (ch > MAX_UNICODE) {
6962
3
            message = "\\Uxxxxxxxx out of range";
6963
3
            goto error;
6964
3
        }
6965
917
        WRITE_CHAR(ch);
6966
917
        continue;
6967
6968
917
      incomplete:
6969
7
        if (consumed) {
6970
0
            *consumed = startinpos;
6971
0
            break;
6972
0
        }
6973
29
      error:;
6974
29
        Py_ssize_t endinpos = s-starts;
6975
29
        writer.min_length = end - s + writer.pos;
6976
29
        if (unicode_decode_call_errorhandler_writer(
6977
29
                errors, &errorHandler,
6978
29
                "rawunicodeescape", message,
6979
29
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6980
29
                &writer)) {
6981
29
            goto onError;
6982
29
        }
6983
29
        assert(end - s <= writer.size - writer.pos);
6984
6985
0
#undef WRITE_CHAR
6986
0
    }
6987
83
    Py_XDECREF(errorHandler);
6988
83
    Py_XDECREF(exc);
6989
83
    return _PyUnicodeWriter_Finish(&writer);
6990
6991
29
  onError:
6992
29
    _PyUnicodeWriter_Dealloc(&writer);
6993
29
    Py_XDECREF(errorHandler);
6994
29
    Py_XDECREF(exc);
6995
29
    return NULL;
6996
112
}
6997
6998
PyObject *
6999
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7000
                                 Py_ssize_t size,
7001
                                 const char *errors)
7002
0
{
7003
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7004
0
}
7005
7006
7007
PyObject *
7008
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
7009
247k
{
7010
247k
    if (!PyUnicode_Check(unicode)) {
7011
0
        PyErr_BadArgument();
7012
0
        return NULL;
7013
0
    }
7014
247k
    int kind = PyUnicode_KIND(unicode);
7015
247k
    const void *data = PyUnicode_DATA(unicode);
7016
247k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
7017
247k
    if (len == 0) {
7018
443
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7019
443
    }
7020
247k
    if (kind == PyUnicode_1BYTE_KIND) {
7021
247k
        return PyBytes_FromStringAndSize(data, len);
7022
247k
    }
7023
7024
    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
7025
       bytes, and 1 byte characters 4. */
7026
314
    Py_ssize_t expandsize = kind * 2 + 2;
7027
314
    if (len > PY_SSIZE_T_MAX / expandsize) {
7028
0
        return PyErr_NoMemory();
7029
0
    }
7030
7031
314
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
7032
314
    if (writer == NULL) {
7033
0
        return NULL;
7034
0
    }
7035
314
    char *p = PyBytesWriter_GetData(writer);
7036
7037
5.28M
    for (Py_ssize_t pos = 0; pos < len; pos++) {
7038
5.28M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7039
7040
        /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7041
5.28M
        if (ch < 0x100) {
7042
5.24M
            *p++ = (char) ch;
7043
5.24M
        }
7044
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
7045
42.3k
        else if (ch < 0x10000) {
7046
41.4k
            *p++ = '\\';
7047
41.4k
            *p++ = 'u';
7048
41.4k
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7049
41.4k
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7050
41.4k
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7051
41.4k
            *p++ = Py_hexdigits[ch & 15];
7052
41.4k
        }
7053
        /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7054
833
        else {
7055
833
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7056
833
            *p++ = '\\';
7057
833
            *p++ = 'U';
7058
833
            *p++ = '0';
7059
833
            *p++ = '0';
7060
833
            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7061
833
            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7062
833
            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7063
833
            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7064
833
            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7065
833
            *p++ = Py_hexdigits[ch & 15];
7066
833
        }
7067
5.28M
    }
7068
7069
314
    return PyBytesWriter_FinishWithPointer(writer, p);
7070
314
}
7071
7072
/* --- Latin-1 Codec ------------------------------------------------------ */
7073
7074
PyObject *
7075
PyUnicode_DecodeLatin1(const char *s,
7076
                       Py_ssize_t size,
7077
                       const char *errors)
7078
2.95M
{
7079
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7080
2.95M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7081
2.95M
}
7082
7083
/* create or adjust a UnicodeEncodeError */
7084
static void
7085
make_encode_exception(PyObject **exceptionObject,
7086
                      const char *encoding,
7087
                      PyObject *unicode,
7088
                      Py_ssize_t startpos, Py_ssize_t endpos,
7089
                      const char *reason)
7090
756k
{
7091
756k
    if (*exceptionObject == NULL) {
7092
756k
        *exceptionObject = PyObject_CallFunction(
7093
756k
            PyExc_UnicodeEncodeError, "sOnns",
7094
756k
            encoding, unicode, startpos, endpos, reason);
7095
756k
    }
7096
0
    else {
7097
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7098
0
            goto onError;
7099
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7100
0
            goto onError;
7101
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7102
0
            goto onError;
7103
0
        return;
7104
0
      onError:
7105
0
        Py_CLEAR(*exceptionObject);
7106
0
    }
7107
756k
}
7108
7109
/* raises a UnicodeEncodeError */
7110
static void
7111
raise_encode_exception(PyObject **exceptionObject,
7112
                       const char *encoding,
7113
                       PyObject *unicode,
7114
                       Py_ssize_t startpos, Py_ssize_t endpos,
7115
                       const char *reason)
7116
572k
{
7117
572k
    make_encode_exception(exceptionObject,
7118
572k
                          encoding, unicode, startpos, endpos, reason);
7119
572k
    if (*exceptionObject != NULL)
7120
572k
        PyCodec_StrictErrors(*exceptionObject);
7121
572k
}
7122
7123
/* error handling callback helper:
7124
   build arguments, call the callback and check the arguments,
7125
   put the result into newpos and return the replacement string, which
7126
   has to be freed by the caller */
7127
static PyObject *
7128
unicode_encode_call_errorhandler(const char *errors,
7129
                                 PyObject **errorHandler,
7130
                                 const char *encoding, const char *reason,
7131
                                 PyObject *unicode, PyObject **exceptionObject,
7132
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7133
                                 Py_ssize_t *newpos)
7134
183k
{
7135
183k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7136
183k
    Py_ssize_t len;
7137
183k
    PyObject *restuple;
7138
183k
    PyObject *resunicode;
7139
7140
183k
    if (*errorHandler == NULL) {
7141
183k
        *errorHandler = PyCodec_LookupError(errors);
7142
183k
        if (*errorHandler == NULL)
7143
0
            return NULL;
7144
183k
    }
7145
7146
183k
    len = PyUnicode_GET_LENGTH(unicode);
7147
7148
183k
    make_encode_exception(exceptionObject,
7149
183k
                          encoding, unicode, startpos, endpos, reason);
7150
183k
    if (*exceptionObject == NULL)
7151
0
        return NULL;
7152
7153
183k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7154
183k
    if (restuple == NULL)
7155
183k
        return NULL;
7156
0
    if (!PyTuple_Check(restuple)) {
7157
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7158
0
        Py_DECREF(restuple);
7159
0
        return NULL;
7160
0
    }
7161
0
    if (!PyArg_ParseTuple(restuple, argparse,
7162
0
                          &resunicode, newpos)) {
7163
0
        Py_DECREF(restuple);
7164
0
        return NULL;
7165
0
    }
7166
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7167
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7168
0
        Py_DECREF(restuple);
7169
0
        return NULL;
7170
0
    }
7171
0
    if (*newpos<0)
7172
0
        *newpos = len + *newpos;
7173
0
    if (*newpos<0 || *newpos>len) {
7174
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7175
0
        Py_DECREF(restuple);
7176
0
        return NULL;
7177
0
    }
7178
0
    Py_INCREF(resunicode);
7179
0
    Py_DECREF(restuple);
7180
0
    return resunicode;
7181
0
}
7182
7183
static PyObject *
7184
unicode_encode_ucs1(PyObject *unicode,
7185
                    const char *errors,
7186
                    const Py_UCS4 limit)
7187
591k
{
7188
    /* input state */
7189
591k
    Py_ssize_t pos=0, size;
7190
591k
    int kind;
7191
591k
    const void *data;
7192
591k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7193
591k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7194
591k
    PyObject *error_handler_obj = NULL;
7195
591k
    PyObject *exc = NULL;
7196
591k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7197
591k
    PyObject *rep = NULL;
7198
7199
591k
    size = PyUnicode_GET_LENGTH(unicode);
7200
591k
    kind = PyUnicode_KIND(unicode);
7201
591k
    data = PyUnicode_DATA(unicode);
7202
    /* allocate enough for a simple encoding without
7203
       replacements, if we need more, we'll resize */
7204
591k
    if (size == 0)
7205
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7206
7207
    /* output object */
7208
591k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7209
591k
    if (writer == NULL) {
7210
0
        return NULL;
7211
0
    }
7212
    /* pointer into the output */
7213
591k
    char *str = PyBytesWriter_GetData(writer);
7214
7215
7.44M
    while (pos < size) {
7216
7.44M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7217
7218
        /* can we encode this? */
7219
7.44M
        if (ch < limit) {
7220
            /* no overflow check, because we know that the space is enough */
7221
6.84M
            *str++ = (char)ch;
7222
6.84M
            ++pos;
7223
6.84M
        }
7224
591k
        else {
7225
591k
            Py_ssize_t newpos, i;
7226
            /* startpos for collecting unencodable chars */
7227
591k
            Py_ssize_t collstart = pos;
7228
591k
            Py_ssize_t collend = collstart + 1;
7229
            /* find all unecodable characters */
7230
7231
2.43M
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7232
1.84M
                ++collend;
7233
7234
            /* Only overallocate the buffer if it's not the last write */
7235
591k
            writer->overallocate = (collend < size);
7236
7237
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7238
591k
            if (error_handler == _Py_ERROR_UNKNOWN)
7239
591k
                error_handler = _Py_GetErrorHandler(errors);
7240
7241
591k
            switch (error_handler) {
7242
572k
            case _Py_ERROR_STRICT:
7243
572k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7244
572k
                goto onError;
7245
7246
5.77k
            case _Py_ERROR_REPLACE:
7247
5.77k
                memset(str, '?', collend - collstart);
7248
5.77k
                str += (collend - collstart);
7249
5.77k
                _Py_FALLTHROUGH;
7250
5.77k
            case _Py_ERROR_IGNORE:
7251
5.77k
                pos = collend;
7252
5.77k
                break;
7253
7254
0
            case _Py_ERROR_BACKSLASHREPLACE:
7255
                /* subtract preallocated bytes */
7256
0
                writer->size -= (collend - collstart);
7257
0
                str = backslashreplace(writer, str,
7258
0
                                       unicode, collstart, collend);
7259
0
                if (str == NULL)
7260
0
                    goto onError;
7261
0
                pos = collend;
7262
0
                break;
7263
7264
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7265
                /* subtract preallocated bytes */
7266
0
                writer->size -= (collend - collstart);
7267
0
                str = xmlcharrefreplace(writer, str,
7268
0
                                        unicode, collstart, collend);
7269
0
                if (str == NULL)
7270
0
                    goto onError;
7271
0
                pos = collend;
7272
0
                break;
7273
7274
12.8k
            case _Py_ERROR_SURROGATEESCAPE:
7275
12.8k
                for (i = collstart; i < collend; ++i) {
7276
12.8k
                    ch = PyUnicode_READ(kind, data, i);
7277
12.8k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7278
                        /* Not a UTF-8b surrogate */
7279
12.8k
                        break;
7280
12.8k
                    }
7281
0
                    *str++ = (char)(ch - 0xdc00);
7282
0
                    ++pos;
7283
0
                }
7284
12.8k
                if (i >= collend)
7285
0
                    break;
7286
12.8k
                collstart = pos;
7287
12.8k
                assert(collstart != collend);
7288
12.8k
                _Py_FALLTHROUGH;
7289
7290
12.8k
            default:
7291
12.8k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7292
12.8k
                                                       encoding, reason, unicode, &exc,
7293
12.8k
                                                       collstart, collend, &newpos);
7294
12.8k
                if (rep == NULL)
7295
12.8k
                    goto onError;
7296
7297
0
                if (newpos < collstart) {
7298
0
                    writer->overallocate = 1;
7299
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7300
0
                                                             collstart - newpos,
7301
0
                                                             str);
7302
0
                    if (str == NULL) {
7303
0
                        goto onError;
7304
0
                    }
7305
0
                }
7306
0
                else {
7307
                    /* subtract preallocated bytes */
7308
0
                    writer->size -= newpos - collstart;
7309
                    /* Only overallocate the buffer if it's not the last write */
7310
0
                    writer->overallocate = (newpos < size);
7311
0
                }
7312
7313
0
                char *rep_str;
7314
0
                Py_ssize_t rep_len;
7315
0
                if (PyBytes_Check(rep)) {
7316
                    /* Directly copy bytes result to output. */
7317
0
                    rep_str = PyBytes_AS_STRING(rep);
7318
0
                    rep_len = PyBytes_GET_SIZE(rep);
7319
0
                }
7320
0
                else {
7321
0
                    assert(PyUnicode_Check(rep));
7322
7323
0
                    if (limit == 256 ?
7324
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7325
0
                        !PyUnicode_IS_ASCII(rep))
7326
0
                    {
7327
                        /* Not all characters are smaller than limit */
7328
0
                        raise_encode_exception(&exc, encoding, unicode,
7329
0
                                               collstart, collend, reason);
7330
0
                        goto onError;
7331
0
                    }
7332
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7333
0
                    rep_str = PyUnicode_DATA(rep);
7334
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7335
0
                }
7336
7337
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7338
0
                if (str == NULL) {
7339
0
                    goto onError;
7340
0
                }
7341
0
                memcpy(str, rep_str, rep_len);
7342
0
                str += rep_len;
7343
7344
0
                pos = newpos;
7345
0
                Py_CLEAR(rep);
7346
591k
            }
7347
7348
            /* If overallocation was disabled, ensure that it was the last
7349
               write. Otherwise, we missed an optimization */
7350
591k
            assert(writer->overallocate || pos == size);
7351
5.77k
        }
7352
7.44M
    }
7353
7354
5.60k
    Py_XDECREF(error_handler_obj);
7355
5.60k
    Py_XDECREF(exc);
7356
5.60k
    return PyBytesWriter_FinishWithPointer(writer, str);
7357
7358
585k
  onError:
7359
585k
    Py_XDECREF(rep);
7360
585k
    PyBytesWriter_Discard(writer);
7361
585k
    Py_XDECREF(error_handler_obj);
7362
585k
    Py_XDECREF(exc);
7363
585k
    return NULL;
7364
591k
}
7365
7366
PyObject *
7367
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
7368
10
{
7369
10
    if (!PyUnicode_Check(unicode)) {
7370
0
        PyErr_BadArgument();
7371
0
        return NULL;
7372
0
    }
7373
    /* Fast path: if it is a one-byte string, construct
7374
       bytes object directly. */
7375
10
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7376
10
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7377
10
                                         PyUnicode_GET_LENGTH(unicode));
7378
    /* Non-Latin-1 characters present. Defer to above function to
7379
       raise the exception. */
7380
0
    return unicode_encode_ucs1(unicode, errors, 256);
7381
10
}
7382
7383
PyObject*
7384
PyUnicode_AsLatin1String(PyObject *unicode)
7385
0
{
7386
0
    return _PyUnicode_AsLatin1String(unicode, NULL);
7387
0
}
7388
7389
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7390
7391
PyObject *
7392
PyUnicode_DecodeASCII(const char *s,
7393
                      Py_ssize_t size,
7394
                      const char *errors)
7395
11.9M
{
7396
11.9M
    const char *starts = s;
7397
11.9M
    const char *e = s + size;
7398
11.9M
    PyObject *error_handler_obj = NULL;
7399
11.9M
    PyObject *exc = NULL;
7400
11.9M
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7401
7402
11.9M
    if (size == 0)
7403
0
        _Py_RETURN_UNICODE_EMPTY();
7404
7405
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7406
11.9M
    if (size == 1 && (unsigned char)s[0] < 128) {
7407
524k
        return get_latin1_char((unsigned char)s[0]);
7408
524k
    }
7409
7410
    // Shortcut for simple case
7411
11.4M
    PyObject *u = PyUnicode_New(size, 127);
7412
11.4M
    if (u == NULL) {
7413
0
        return NULL;
7414
0
    }
7415
11.4M
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7416
11.4M
    if (outpos == size) {
7417
9.00M
        return u;
7418
9.00M
    }
7419
7420
2.41M
    _PyUnicodeWriter writer;
7421
2.41M
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7422
2.41M
    writer.pos = outpos;
7423
7424
2.41M
    s += outpos;
7425
2.41M
    int kind = writer.kind;
7426
2.41M
    void *data = writer.data;
7427
2.41M
    Py_ssize_t startinpos, endinpos;
7428
7429
22.9M
    while (s < e) {
7430
22.7M
        unsigned char c = (unsigned char)*s;
7431
22.7M
        if (c < 128) {
7432
6.52M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7433
6.52M
            writer.pos++;
7434
6.52M
            ++s;
7435
6.52M
            continue;
7436
6.52M
        }
7437
7438
        /* byte outsize range 0x00..0x7f: call the error handler */
7439
7440
16.2M
        if (error_handler == _Py_ERROR_UNKNOWN)
7441
2.41M
            error_handler = _Py_GetErrorHandler(errors);
7442
7443
16.2M
        switch (error_handler)
7444
16.2M
        {
7445
800k
        case _Py_ERROR_REPLACE:
7446
14.0M
        case _Py_ERROR_SURROGATEESCAPE:
7447
            /* Fast-path: the error handler only writes one character,
7448
               but we may switch to UCS2 at the first write */
7449
14.0M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7450
0
                goto onError;
7451
14.0M
            kind = writer.kind;
7452
14.0M
            data = writer.data;
7453
7454
14.0M
            if (error_handler == _Py_ERROR_REPLACE)
7455
800k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7456
13.2M
            else
7457
13.2M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7458
14.0M
            writer.pos++;
7459
14.0M
            ++s;
7460
14.0M
            break;
7461
7462
0
        case _Py_ERROR_IGNORE:
7463
0
            ++s;
7464
0
            break;
7465
7466
2.24M
        default:
7467
2.24M
            startinpos = s-starts;
7468
2.24M
            endinpos = startinpos + 1;
7469
2.24M
            if (unicode_decode_call_errorhandler_writer(
7470
2.24M
                    errors, &error_handler_obj,
7471
2.24M
                    "ascii", "ordinal not in range(128)",
7472
2.24M
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7473
2.24M
                    &writer))
7474
2.24M
                goto onError;
7475
0
            kind = writer.kind;
7476
0
            data = writer.data;
7477
16.2M
        }
7478
16.2M
    }
7479
167k
    Py_XDECREF(error_handler_obj);
7480
167k
    Py_XDECREF(exc);
7481
167k
    return _PyUnicodeWriter_Finish(&writer);
7482
7483
2.24M
  onError:
7484
2.24M
    _PyUnicodeWriter_Dealloc(&writer);
7485
2.24M
    Py_XDECREF(error_handler_obj);
7486
2.24M
    Py_XDECREF(exc);
7487
2.24M
    return NULL;
7488
2.41M
}
7489
7490
PyObject *
7491
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
7492
953k
{
7493
953k
    if (!PyUnicode_Check(unicode)) {
7494
0
        PyErr_BadArgument();
7495
0
        return NULL;
7496
0
    }
7497
    /* Fast path: if it is an ASCII-only string, construct bytes object
7498
       directly. Else defer to above function to raise the exception. */
7499
953k
    if (PyUnicode_IS_ASCII(unicode))
7500
362k
        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7501
362k
                                         PyUnicode_GET_LENGTH(unicode));
7502
591k
    return unicode_encode_ucs1(unicode, errors, 128);
7503
953k
}
7504
7505
PyObject *
7506
PyUnicode_AsASCIIString(PyObject *unicode)
7507
120k
{
7508
120k
    return _PyUnicode_AsASCIIString(unicode, NULL);
7509
120k
}
7510
7511
#ifdef MS_WINDOWS
7512
7513
/* --- MBCS codecs for Windows -------------------------------------------- */
7514
7515
#if SIZEOF_INT < SIZEOF_SIZE_T
7516
#define NEED_RETRY
7517
#endif
7518
7519
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7520
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7521
   both cases also and avoids partial characters overrunning the
7522
   length limit in MultiByteToWideChar on Windows */
7523
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7524
7525
#ifndef WC_ERR_INVALID_CHARS
7526
#  define WC_ERR_INVALID_CHARS 0x0080
7527
#endif
7528
7529
static const char*
7530
code_page_name(UINT code_page, PyObject **obj)
7531
{
7532
    *obj = NULL;
7533
    if (code_page == CP_ACP)
7534
        return "mbcs";
7535
7536
    *obj = PyBytes_FromFormat("cp%u", code_page);
7537
    if (*obj == NULL)
7538
        return NULL;
7539
    return PyBytes_AS_STRING(*obj);
7540
}
7541
7542
static DWORD
7543
decode_code_page_flags(UINT code_page)
7544
{
7545
    if (code_page == CP_UTF7) {
7546
        /* The CP_UTF7 decoder only supports flags=0 */
7547
        return 0;
7548
    }
7549
    else
7550
        return MB_ERR_INVALID_CHARS;
7551
}
7552
7553
/*
7554
 * Decode a byte string from a Windows code page into unicode object in strict
7555
 * mode.
7556
 *
7557
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7558
 * OSError and returns -1 on other error.
7559
 */
7560
static int
7561
decode_code_page_strict(UINT code_page,
7562
                        wchar_t **buf,
7563
                        Py_ssize_t *bufsize,
7564
                        const char *in,
7565
                        int insize)
7566
{
7567
    DWORD flags = MB_ERR_INVALID_CHARS;
7568
    wchar_t *out;
7569
    DWORD outsize;
7570
7571
    /* First get the size of the result */
7572
    assert(insize > 0);
7573
    while ((outsize = MultiByteToWideChar(code_page, flags,
7574
                                          in, insize, NULL, 0)) <= 0)
7575
    {
7576
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7577
            goto error;
7578
        }
7579
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7580
        flags = 0;
7581
    }
7582
7583
    /* Extend a wchar_t* buffer */
7584
    Py_ssize_t n = *bufsize;   /* Get the current length */
7585
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7586
        return -1;
7587
    }
7588
    out = *buf + n;
7589
7590
    /* Do the conversion */
7591
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7592
    if (outsize <= 0)
7593
        goto error;
7594
    return insize;
7595
7596
error:
7597
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7598
        return -2;
7599
    PyErr_SetFromWindowsErr(0);
7600
    return -1;
7601
}
7602
7603
/*
7604
 * Decode a byte string from a code page into unicode object with an error
7605
 * handler.
7606
 *
7607
 * Returns consumed size if succeed, or raise an OSError or
7608
 * UnicodeDecodeError exception and returns -1 on error.
7609
 */
7610
static int
7611
decode_code_page_errors(UINT code_page,
7612
                        wchar_t **buf,
7613
                        Py_ssize_t *bufsize,
7614
                        const char *in, const int size,
7615
                        const char *errors, int final)
7616
{
7617
    const char *startin = in;
7618
    const char *endin = in + size;
7619
    DWORD flags = MB_ERR_INVALID_CHARS;
7620
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7621
       2000 English version of the message. */
7622
    const char *reason = "No mapping for the Unicode character exists "
7623
                         "in the target code page.";
7624
    /* each step cannot decode more than 1 character, but a character can be
7625
       represented as a surrogate pair */
7626
    wchar_t buffer[2], *out;
7627
    int insize;
7628
    Py_ssize_t outsize;
7629
    PyObject *errorHandler = NULL;
7630
    PyObject *exc = NULL;
7631
    PyObject *encoding_obj = NULL;
7632
    const char *encoding;
7633
    DWORD err;
7634
    int ret = -1;
7635
7636
    assert(size > 0);
7637
7638
    encoding = code_page_name(code_page, &encoding_obj);
7639
    if (encoding == NULL)
7640
        return -1;
7641
7642
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7643
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7644
           UnicodeDecodeError. */
7645
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7646
        if (exc != NULL) {
7647
            PyCodec_StrictErrors(exc);
7648
            Py_CLEAR(exc);
7649
        }
7650
        goto error;
7651
    }
7652
7653
    /* Extend a wchar_t* buffer */
7654
    Py_ssize_t n = *bufsize;   /* Get the current length */
7655
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7656
        PyErr_NoMemory();
7657
        goto error;
7658
    }
7659
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7660
        goto error;
7661
    }
7662
    out = *buf + n;
7663
7664
    /* Decode the byte string character per character */
7665
    while (in < endin)
7666
    {
7667
        /* Decode a character */
7668
        insize = 1;
7669
        do
7670
        {
7671
            outsize = MultiByteToWideChar(code_page, flags,
7672
                                          in, insize,
7673
                                          buffer, Py_ARRAY_LENGTH(buffer));
7674
            if (outsize > 0)
7675
                break;
7676
            err = GetLastError();
7677
            if (err == ERROR_INVALID_FLAGS && flags) {
7678
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7679
                flags = 0;
7680
                continue;
7681
            }
7682
            if (err != ERROR_NO_UNICODE_TRANSLATION
7683
                && err != ERROR_INSUFFICIENT_BUFFER)
7684
            {
7685
                PyErr_SetFromWindowsErr(err);
7686
                goto error;
7687
            }
7688
            insize++;
7689
        }
7690
        /* 4=maximum length of a UTF-8 sequence */
7691
        while (insize <= 4 && (in + insize) <= endin);
7692
7693
        if (outsize <= 0) {
7694
            Py_ssize_t startinpos, endinpos, outpos;
7695
7696
            /* last character in partial decode? */
7697
            if (in + insize >= endin && !final)
7698
                break;
7699
7700
            startinpos = in - startin;
7701
            endinpos = startinpos + 1;
7702
            outpos = out - *buf;
7703
            if (unicode_decode_call_errorhandler_wchar(
7704
                    errors, &errorHandler,
7705
                    encoding, reason,
7706
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7707
                    buf, bufsize, &outpos))
7708
            {
7709
                goto error;
7710
            }
7711
            out = *buf + outpos;
7712
        }
7713
        else {
7714
            in += insize;
7715
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7716
            out += outsize;
7717
        }
7718
    }
7719
7720
    /* Shrink the buffer */
7721
    assert(out - *buf <= *bufsize);
7722
    *bufsize = out - *buf;
7723
    /* (in - startin) <= size and size is an int */
7724
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7725
7726
error:
7727
    Py_XDECREF(encoding_obj);
7728
    Py_XDECREF(errorHandler);
7729
    Py_XDECREF(exc);
7730
    return ret;
7731
}
7732
7733
static PyObject *
7734
decode_code_page_stateful(int code_page,
7735
                          const char *s, Py_ssize_t size,
7736
                          const char *errors, Py_ssize_t *consumed)
7737
{
7738
    wchar_t *buf = NULL;
7739
    Py_ssize_t bufsize = 0;
7740
    int chunk_size, final, converted, done;
7741
7742
    if (code_page < 0) {
7743
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7744
        return NULL;
7745
    }
7746
    if (size < 0) {
7747
        PyErr_BadInternalCall();
7748
        return NULL;
7749
    }
7750
7751
    if (consumed)
7752
        *consumed = 0;
7753
7754
    do
7755
    {
7756
#ifdef NEED_RETRY
7757
        if (size > DECODING_CHUNK_SIZE) {
7758
            chunk_size = DECODING_CHUNK_SIZE;
7759
            final = 0;
7760
            done = 0;
7761
        }
7762
        else
7763
#endif
7764
        {
7765
            chunk_size = (int)size;
7766
            final = (consumed == NULL);
7767
            done = 1;
7768
        }
7769
7770
        if (chunk_size == 0 && done) {
7771
            if (buf != NULL)
7772
                break;
7773
            _Py_RETURN_UNICODE_EMPTY();
7774
        }
7775
7776
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7777
                                            s, chunk_size);
7778
        if (converted == -2)
7779
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7780
                                                s, chunk_size,
7781
                                                errors, final);
7782
        assert(converted != 0 || done);
7783
7784
        if (converted < 0) {
7785
            PyMem_Free(buf);
7786
            return NULL;
7787
        }
7788
7789
        if (consumed)
7790
            *consumed += converted;
7791
7792
        s += converted;
7793
        size -= converted;
7794
    } while (!done);
7795
7796
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7797
    PyMem_Free(buf);
7798
    return v;
7799
}
7800
7801
PyObject *
7802
PyUnicode_DecodeCodePageStateful(int code_page,
7803
                                 const char *s,
7804
                                 Py_ssize_t size,
7805
                                 const char *errors,
7806
                                 Py_ssize_t *consumed)
7807
{
7808
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7809
}
7810
7811
PyObject *
7812
PyUnicode_DecodeMBCSStateful(const char *s,
7813
                             Py_ssize_t size,
7814
                             const char *errors,
7815
                             Py_ssize_t *consumed)
7816
{
7817
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7818
}
7819
7820
PyObject *
7821
PyUnicode_DecodeMBCS(const char *s,
7822
                     Py_ssize_t size,
7823
                     const char *errors)
7824
{
7825
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7826
}
7827
7828
static DWORD
7829
encode_code_page_flags(UINT code_page, const char *errors)
7830
{
7831
    if (code_page == CP_UTF8) {
7832
        return WC_ERR_INVALID_CHARS;
7833
    }
7834
    else if (code_page == CP_UTF7) {
7835
        /* CP_UTF7 only supports flags=0 */
7836
        return 0;
7837
    }
7838
    else {
7839
        if (errors != NULL && strcmp(errors, "replace") == 0)
7840
            return 0;
7841
        else
7842
            return WC_NO_BEST_FIT_CHARS;
7843
    }
7844
}
7845
7846
/*
7847
 * Encode a Unicode string to a Windows code page into a byte string in strict
7848
 * mode.
7849
 *
7850
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7851
 * an OSError and returns -1 on other error.
7852
 */
7853
static int
7854
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7855
                        PyObject *unicode, Py_ssize_t offset, int len,
7856
                        const char* errors)
7857
{
7858
    BOOL usedDefaultChar = FALSE;
7859
    BOOL *pusedDefaultChar = &usedDefaultChar;
7860
    int outsize;
7861
    wchar_t *p;
7862
    Py_ssize_t size;
7863
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7864
    char *out;
7865
    /* Create a substring so that we can get the UTF-16 representation
7866
       of just the slice under consideration. */
7867
    PyObject *substring;
7868
    int ret = -1;
7869
7870
    assert(len > 0);
7871
7872
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7873
        pusedDefaultChar = &usedDefaultChar;
7874
    else
7875
        pusedDefaultChar = NULL;
7876
7877
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7878
    if (substring == NULL)
7879
        return -1;
7880
    p = PyUnicode_AsWideCharString(substring, &size);
7881
    Py_CLEAR(substring);
7882
    if (p == NULL) {
7883
        return -1;
7884
    }
7885
    assert(size <= INT_MAX);
7886
7887
    /* First get the size of the result */
7888
    outsize = WideCharToMultiByte(code_page, flags,
7889
                                  p, (int)size,
7890
                                  NULL, 0,
7891
                                  NULL, pusedDefaultChar);
7892
    if (outsize <= 0)
7893
        goto error;
7894
    /* If we used a default char, then we failed! */
7895
    if (pusedDefaultChar && *pusedDefaultChar) {
7896
        ret = -2;
7897
        goto done;
7898
    }
7899
7900
    if (*writer == NULL) {
7901
        /* Create string object */
7902
        *writer = PyBytesWriter_Create(outsize);
7903
        if (*writer == NULL) {
7904
            goto done;
7905
        }
7906
        out = PyBytesWriter_GetData(*writer);
7907
    }
7908
    else {
7909
        /* Extend string object */
7910
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
7911
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
7912
            goto done;
7913
        }
7914
        out = (char*)PyBytesWriter_GetData(*writer) + n;
7915
    }
7916
7917
    /* Do the conversion */
7918
    outsize = WideCharToMultiByte(code_page, flags,
7919
                                  p, (int)size,
7920
                                  out, outsize,
7921
                                  NULL, pusedDefaultChar);
7922
    if (outsize <= 0)
7923
        goto error;
7924
    if (pusedDefaultChar && *pusedDefaultChar) {
7925
        ret = -2;
7926
        goto done;
7927
    }
7928
    ret = 0;
7929
7930
done:
7931
    PyMem_Free(p);
7932
    return ret;
7933
7934
error:
7935
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7936
        ret = -2;
7937
        goto done;
7938
    }
7939
    PyErr_SetFromWindowsErr(0);
7940
    goto done;
7941
}
7942
7943
/*
7944
 * Encode a Unicode string to a Windows code page into a byte string using an
7945
 * error handler.
7946
 *
7947
 * Returns consumed characters if succeed, or raise an OSError and returns
7948
 * -1 on other error.
7949
 */
7950
static int
7951
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
7952
                        PyObject *unicode, Py_ssize_t unicode_offset,
7953
                        Py_ssize_t insize, const char* errors)
7954
{
7955
    const DWORD flags = encode_code_page_flags(code_page, errors);
7956
    Py_ssize_t pos = unicode_offset;
7957
    Py_ssize_t endin = unicode_offset + insize;
7958
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7959
       2000 English version of the message. */
7960
    const char *reason = "invalid character";
7961
    /* 4=maximum length of a UTF-8 sequence */
7962
    char buffer[4];
7963
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7964
    Py_ssize_t outsize;
7965
    char *out;
7966
    PyObject *errorHandler = NULL;
7967
    PyObject *exc = NULL;
7968
    PyObject *encoding_obj = NULL;
7969
    const char *encoding;
7970
    Py_ssize_t newpos;
7971
    PyObject *rep;
7972
    int ret = -1;
7973
7974
    assert(insize > 0);
7975
7976
    encoding = code_page_name(code_page, &encoding_obj);
7977
    if (encoding == NULL)
7978
        return -1;
7979
7980
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7981
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7982
           then we raise a UnicodeEncodeError. */
7983
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7984
        if (exc != NULL) {
7985
            PyCodec_StrictErrors(exc);
7986
            Py_DECREF(exc);
7987
        }
7988
        Py_XDECREF(encoding_obj);
7989
        return -1;
7990
    }
7991
7992
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7993
        pusedDefaultChar = &usedDefaultChar;
7994
    else
7995
        pusedDefaultChar = NULL;
7996
7997
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7998
        PyErr_NoMemory();
7999
        goto error;
8000
    }
8001
    outsize = insize * Py_ARRAY_LENGTH(buffer);
8002
8003
    if (*writer == NULL) {
8004
        /* Create string object */
8005
        *writer = PyBytesWriter_Create(outsize);
8006
        if (*writer == NULL) {
8007
            goto error;
8008
        }
8009
        out = PyBytesWriter_GetData(*writer);
8010
    }
8011
    else {
8012
        /* Extend string object */
8013
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8014
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8015
            goto error;
8016
        }
8017
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8018
    }
8019
8020
    /* Encode the string character per character */
8021
    while (pos < endin)
8022
    {
8023
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8024
        wchar_t chars[2];
8025
        int charsize;
8026
        if (ch < 0x10000) {
8027
            chars[0] = (wchar_t)ch;
8028
            charsize = 1;
8029
        }
8030
        else {
8031
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8032
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
8033
            charsize = 2;
8034
        }
8035
8036
        outsize = WideCharToMultiByte(code_page, flags,
8037
                                      chars, charsize,
8038
                                      buffer, Py_ARRAY_LENGTH(buffer),
8039
                                      NULL, pusedDefaultChar);
8040
        if (outsize > 0) {
8041
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8042
            {
8043
                pos++;
8044
                memcpy(out, buffer, outsize);
8045
                out += outsize;
8046
                continue;
8047
            }
8048
        }
8049
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8050
            PyErr_SetFromWindowsErr(0);
8051
            goto error;
8052
        }
8053
8054
        rep = unicode_encode_call_errorhandler(
8055
                  errors, &errorHandler, encoding, reason,
8056
                  unicode, &exc,
8057
                  pos, pos + 1, &newpos);
8058
        if (rep == NULL)
8059
            goto error;
8060
8061
        Py_ssize_t morebytes = pos - newpos;
8062
        if (PyBytes_Check(rep)) {
8063
            outsize = PyBytes_GET_SIZE(rep);
8064
            morebytes += outsize;
8065
            if (morebytes > 0) {
8066
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8067
                if (out == NULL) {
8068
                    Py_DECREF(rep);
8069
                    goto error;
8070
                }
8071
            }
8072
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
8073
            out += outsize;
8074
        }
8075
        else {
8076
            Py_ssize_t i;
8077
            int kind;
8078
            const void *data;
8079
8080
            outsize = PyUnicode_GET_LENGTH(rep);
8081
            morebytes += outsize;
8082
            if (morebytes > 0) {
8083
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
8084
                if (out == NULL) {
8085
                    Py_DECREF(rep);
8086
                    goto error;
8087
                }
8088
            }
8089
            kind = PyUnicode_KIND(rep);
8090
            data = PyUnicode_DATA(rep);
8091
            for (i=0; i < outsize; i++) {
8092
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8093
                if (ch > 127) {
8094
                    raise_encode_exception(&exc,
8095
                        encoding, unicode,
8096
                        pos, pos + 1,
8097
                        "unable to encode error handler result to ASCII");
8098
                    Py_DECREF(rep);
8099
                    goto error;
8100
                }
8101
                *out = (unsigned char)ch;
8102
                out++;
8103
            }
8104
        }
8105
        pos = newpos;
8106
        Py_DECREF(rep);
8107
    }
8108
    /* write a NUL byte */
8109
    *out = 0;
8110
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
8111
    assert(outsize <= PyBytesWriter_GetSize(*writer));
8112
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
8113
        goto error;
8114
    }
8115
    ret = 0;
8116
8117
error:
8118
    Py_XDECREF(encoding_obj);
8119
    Py_XDECREF(errorHandler);
8120
    Py_XDECREF(exc);
8121
    return ret;
8122
}
8123
8124
8125
PyObject *
8126
PyUnicode_EncodeCodePage(int code_page,
8127
                         PyObject *unicode,
8128
                         const char *errors)
8129
{
8130
    Py_ssize_t len;
8131
    PyBytesWriter *writer = NULL;
8132
    Py_ssize_t offset;
8133
    int chunk_len, ret, done;
8134
8135
    if (!PyUnicode_Check(unicode)) {
8136
        PyErr_BadArgument();
8137
        return NULL;
8138
    }
8139
8140
    len = PyUnicode_GET_LENGTH(unicode);
8141
8142
    if (code_page < 0) {
8143
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
8144
        return NULL;
8145
    }
8146
8147
    if (len == 0)
8148
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
8149
8150
    offset = 0;
8151
    do
8152
    {
8153
#ifdef NEED_RETRY
8154
        if (len > DECODING_CHUNK_SIZE) {
8155
            chunk_len = DECODING_CHUNK_SIZE;
8156
            done = 0;
8157
        }
8158
        else
8159
#endif
8160
        {
8161
            chunk_len = (int)len;
8162
            done = 1;
8163
        }
8164
8165
        ret = encode_code_page_strict(code_page, &writer,
8166
                                      unicode, offset, chunk_len,
8167
                                      errors);
8168
        if (ret == -2)
8169
            ret = encode_code_page_errors(code_page, &writer,
8170
                                          unicode, offset,
8171
                                          chunk_len, errors);
8172
        if (ret < 0) {
8173
            PyBytesWriter_Discard(writer);
8174
            return NULL;
8175
        }
8176
8177
        offset += chunk_len;
8178
        len -= chunk_len;
8179
    } while (!done);
8180
8181
    return PyBytesWriter_Finish(writer);
8182
}
8183
8184
8185
PyObject *
8186
PyUnicode_AsMBCSString(PyObject *unicode)
8187
{
8188
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
8189
}
8190
8191
#undef NEED_RETRY
8192
8193
#endif /* MS_WINDOWS */
8194
8195
/* --- Character Mapping Codec -------------------------------------------- */
8196
8197
static int
8198
charmap_decode_string(const char *s,
8199
                      Py_ssize_t size,
8200
                      PyObject *mapping,
8201
                      const char *errors,
8202
                      _PyUnicodeWriter *writer)
8203
655k
{
8204
655k
    const char *starts = s;
8205
655k
    const char *e;
8206
655k
    Py_ssize_t startinpos, endinpos;
8207
655k
    PyObject *errorHandler = NULL, *exc = NULL;
8208
655k
    Py_ssize_t maplen;
8209
655k
    int mapkind;
8210
655k
    const void *mapdata;
8211
655k
    Py_UCS4 x;
8212
655k
    unsigned char ch;
8213
8214
655k
    maplen = PyUnicode_GET_LENGTH(mapping);
8215
655k
    mapdata = PyUnicode_DATA(mapping);
8216
655k
    mapkind = PyUnicode_KIND(mapping);
8217
8218
655k
    e = s + size;
8219
8220
655k
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8221
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8222
         * is disabled in encoding aliases, latin1 is preferred because
8223
         * its implementation is faster. */
8224
130
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
8225
130
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8226
130
        Py_UCS4 maxchar = writer->maxchar;
8227
8228
130
        assert (writer->kind == PyUnicode_1BYTE_KIND);
8229
1.05M
        while (s < e) {
8230
1.05M
            ch = *s;
8231
1.05M
            x = mapdata_ucs1[ch];
8232
1.05M
            if (x > maxchar) {
8233
120
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8234
0
                    goto onError;
8235
120
                maxchar = writer->maxchar;
8236
120
                outdata = (Py_UCS1 *)writer->data;
8237
120
            }
8238
1.05M
            outdata[writer->pos] = x;
8239
1.05M
            writer->pos++;
8240
1.05M
            ++s;
8241
1.05M
        }
8242
130
        return 0;
8243
130
    }
8244
8245
766k
    while (s < e) {
8246
750k
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8247
750k
            int outkind = writer->kind;
8248
750k
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
8249
750k
            if (outkind == PyUnicode_1BYTE_KIND) {
8250
690k
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8251
690k
                Py_UCS4 maxchar = writer->maxchar;
8252
21.4M
                while (s < e) {
8253
20.7M
                    ch = *s;
8254
20.7M
                    x = mapdata_ucs2[ch];
8255
20.7M
                    if (x > maxchar)
8256
75.7k
                        goto Error;
8257
20.7M
                    outdata[writer->pos] = x;
8258
20.7M
                    writer->pos++;
8259
20.7M
                    ++s;
8260
20.7M
                }
8261
615k
                break;
8262
690k
            }
8263
60.0k
            else if (outkind == PyUnicode_2BYTE_KIND) {
8264
60.0k
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8265
32.1M
                while (s < e) {
8266
32.1M
                    ch = *s;
8267
32.1M
                    x = mapdata_ucs2[ch];
8268
32.1M
                    if (x == 0xFFFE)
8269
35.7k
                        goto Error;
8270
32.1M
                    outdata[writer->pos] = x;
8271
32.1M
                    writer->pos++;
8272
32.1M
                    ++s;
8273
32.1M
                }
8274
24.3k
                break;
8275
60.0k
            }
8276
750k
        }
8277
0
        ch = *s;
8278
8279
0
        if (ch < maplen)
8280
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
8281
0
        else
8282
0
            x = 0xfffe; /* invalid value */
8283
111k
Error:
8284
111k
        if (x == 0xfffe)
8285
56.7k
        {
8286
            /* undefined mapping */
8287
56.7k
            startinpos = s-starts;
8288
56.7k
            endinpos = startinpos+1;
8289
56.7k
            if (unicode_decode_call_errorhandler_writer(
8290
56.7k
                    errors, &errorHandler,
8291
56.7k
                    "charmap", "character maps to <undefined>",
8292
56.7k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
8293
56.7k
                    writer)) {
8294
18
                goto onError;
8295
18
            }
8296
56.7k
            continue;
8297
56.7k
        }
8298
8299
54.6k
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8300
0
            goto onError;
8301
54.6k
        ++s;
8302
54.6k
    }
8303
654k
    Py_XDECREF(errorHandler);
8304
654k
    Py_XDECREF(exc);
8305
654k
    return 0;
8306
8307
18
onError:
8308
18
    Py_XDECREF(errorHandler);
8309
18
    Py_XDECREF(exc);
8310
18
    return -1;
8311
654k
}
8312
8313
static int
8314
charmap_decode_mapping(const char *s,
8315
                       Py_ssize_t size,
8316
                       PyObject *mapping,
8317
                       const char *errors,
8318
                       _PyUnicodeWriter *writer)
8319
0
{
8320
0
    const char *starts = s;
8321
0
    const char *e;
8322
0
    Py_ssize_t startinpos, endinpos;
8323
0
    PyObject *errorHandler = NULL, *exc = NULL;
8324
0
    unsigned char ch;
8325
0
    PyObject *key, *item = NULL;
8326
8327
0
    e = s + size;
8328
8329
0
    while (s < e) {
8330
0
        ch = *s;
8331
8332
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8333
0
        key = PyLong_FromLong((long)ch);
8334
0
        if (key == NULL)
8335
0
            goto onError;
8336
8337
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8338
0
        Py_DECREF(key);
8339
0
        if (rc == 0) {
8340
            /* No mapping found means: mapping is undefined. */
8341
0
            goto Undefined;
8342
0
        }
8343
0
        if (item == NULL) {
8344
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8345
                /* No mapping found means: mapping is undefined. */
8346
0
                PyErr_Clear();
8347
0
                goto Undefined;
8348
0
            } else
8349
0
                goto onError;
8350
0
        }
8351
8352
        /* Apply mapping */
8353
0
        if (item == Py_None)
8354
0
            goto Undefined;
8355
0
        if (PyLong_Check(item)) {
8356
0
            long value = PyLong_AsLong(item);
8357
0
            if (value == 0xFFFE)
8358
0
                goto Undefined;
8359
0
            if (value < 0 || value > MAX_UNICODE) {
8360
0
                PyErr_Format(PyExc_TypeError,
8361
0
                             "character mapping must be in range(0x%lx)",
8362
0
                             (unsigned long)MAX_UNICODE + 1);
8363
0
                goto onError;
8364
0
            }
8365
8366
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8367
0
                goto onError;
8368
0
        }
8369
0
        else if (PyUnicode_Check(item)) {
8370
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8371
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8372
0
                if (value == 0xFFFE)
8373
0
                    goto Undefined;
8374
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8375
0
                    goto onError;
8376
0
            }
8377
0
            else {
8378
0
                writer->overallocate = 1;
8379
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8380
0
                    goto onError;
8381
0
            }
8382
0
        }
8383
0
        else {
8384
            /* wrong return value */
8385
0
            PyErr_SetString(PyExc_TypeError,
8386
0
                            "character mapping must return integer, None or str");
8387
0
            goto onError;
8388
0
        }
8389
0
        Py_CLEAR(item);
8390
0
        ++s;
8391
0
        continue;
8392
8393
0
Undefined:
8394
        /* undefined mapping */
8395
0
        Py_CLEAR(item);
8396
0
        startinpos = s-starts;
8397
0
        endinpos = startinpos+1;
8398
0
        if (unicode_decode_call_errorhandler_writer(
8399
0
                errors, &errorHandler,
8400
0
                "charmap", "character maps to <undefined>",
8401
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8402
0
                writer)) {
8403
0
            goto onError;
8404
0
        }
8405
0
    }
8406
0
    Py_XDECREF(errorHandler);
8407
0
    Py_XDECREF(exc);
8408
0
    return 0;
8409
8410
0
onError:
8411
0
    Py_XDECREF(item);
8412
0
    Py_XDECREF(errorHandler);
8413
0
    Py_XDECREF(exc);
8414
0
    return -1;
8415
0
}
8416
8417
PyObject *
8418
PyUnicode_DecodeCharmap(const char *s,
8419
                        Py_ssize_t size,
8420
                        PyObject *mapping,
8421
                        const char *errors)
8422
655k
{
8423
655k
    _PyUnicodeWriter writer;
8424
8425
    /* Default to Latin-1 */
8426
655k
    if (mapping == NULL)
8427
21
        return PyUnicode_DecodeLatin1(s, size, errors);
8428
8429
655k
    if (size == 0)
8430
0
        _Py_RETURN_UNICODE_EMPTY();
8431
655k
    _PyUnicodeWriter_Init(&writer);
8432
655k
    writer.min_length = size;
8433
655k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8434
0
        goto onError;
8435
8436
655k
    if (PyUnicode_CheckExact(mapping)) {
8437
655k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8438
18
            goto onError;
8439
655k
    }
8440
0
    else {
8441
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8442
0
            goto onError;
8443
0
    }
8444
655k
    return _PyUnicodeWriter_Finish(&writer);
8445
8446
18
  onError:
8447
18
    _PyUnicodeWriter_Dealloc(&writer);
8448
18
    return NULL;
8449
655k
}
8450
8451
/* Charmap encoding: the lookup table */
8452
8453
/*[clinic input]
8454
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8455
[clinic start generated code]*/
8456
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8457
8458
struct encoding_map {
8459
    PyObject_HEAD
8460
    unsigned char level1[32];
8461
    int count2, count3;
8462
    unsigned char level23[1];
8463
};
8464
8465
/*[clinic input]
8466
EncodingMap.size
8467
8468
Return the size (in bytes) of this object.
8469
[clinic start generated code]*/
8470
8471
static PyObject *
8472
EncodingMap_size_impl(struct encoding_map *self)
8473
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8474
0
{
8475
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8476
0
                           128*self->count3);
8477
0
}
8478
8479
static PyMethodDef encoding_map_methods[] = {
8480
    ENCODINGMAP_SIZE_METHODDEF
8481
    {NULL, NULL}
8482
};
8483
8484
static PyTypeObject EncodingMapType = {
8485
    PyVarObject_HEAD_INIT(NULL, 0)
8486
    .tp_name = "EncodingMap",
8487
    .tp_basicsize = sizeof(struct encoding_map),
8488
    /* methods */
8489
    .tp_flags = Py_TPFLAGS_DEFAULT,
8490
    .tp_methods = encoding_map_methods,
8491
};
8492
8493
PyObject*
8494
PyUnicode_BuildEncodingMap(PyObject* string)
8495
136
{
8496
136
    PyObject *result;
8497
136
    struct encoding_map *mresult;
8498
136
    int i;
8499
136
    int need_dict = 0;
8500
136
    unsigned char level1[32];
8501
136
    unsigned char level2[512];
8502
136
    unsigned char *mlevel1, *mlevel2, *mlevel3;
8503
136
    int count2 = 0, count3 = 0;
8504
136
    int kind;
8505
136
    const void *data;
8506
136
    int length;
8507
136
    Py_UCS4 ch;
8508
8509
136
    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
8510
0
        PyErr_BadArgument();
8511
0
        return NULL;
8512
0
    }
8513
136
    kind = PyUnicode_KIND(string);
8514
136
    data = PyUnicode_DATA(string);
8515
136
    length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);
8516
136
    memset(level1, 0xFF, sizeof level1);
8517
136
    memset(level2, 0xFF, sizeof level2);
8518
8519
    /* If there isn't a one-to-one mapping of NULL to \0,
8520
       or if there are non-BMP characters, we need to use
8521
       a mapping dictionary. */
8522
136
    if (PyUnicode_READ(kind, data, 0) != 0)
8523
0
        need_dict = 1;
8524
34.8k
    for (i = 1; i < length; i++) {
8525
34.6k
        int l1, l2;
8526
34.6k
        ch = PyUnicode_READ(kind, data, i);
8527
34.6k
        if (ch == 0 || ch > 0xFFFF) {
8528
0
            need_dict = 1;
8529
0
            break;
8530
0
        }
8531
34.6k
        if (ch == 0xFFFE)
8532
            /* unmapped character */
8533
976
            continue;
8534
33.7k
        l1 = ch >> 11;
8535
33.7k
        l2 = ch >> 7;
8536
33.7k
        if (level1[l1] == 0xFF)
8537
249
            level1[l1] = count2++;
8538
33.7k
        if (level2[l2] == 0xFF)
8539
732
            level2[l2] = count3++;
8540
33.7k
    }
8541
8542
136
    if (count2 >= 0xFF || count3 >= 0xFF)
8543
0
        need_dict = 1;
8544
8545
136
    if (need_dict) {
8546
0
        PyObject *result = PyDict_New();
8547
0
        if (!result)
8548
0
            return NULL;
8549
0
        for (i = 0; i < length; i++) {
8550
0
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
8551
0
            PyObject *key = PyLong_FromLong(c);
8552
0
            if (key == NULL) {
8553
0
                Py_DECREF(result);
8554
0
                return NULL;
8555
0
            }
8556
0
            PyObject *value = PyLong_FromLong(i);
8557
0
            if (value == NULL) {
8558
0
                Py_DECREF(key);
8559
0
                Py_DECREF(result);
8560
0
                return NULL;
8561
0
            }
8562
0
            int rc = PyDict_SetItem(result, key, value);
8563
0
            Py_DECREF(key);
8564
0
            Py_DECREF(value);
8565
0
            if (rc < 0) {
8566
0
                Py_DECREF(result);
8567
0
                return NULL;
8568
0
            }
8569
0
        }
8570
0
        return result;
8571
0
    }
8572
8573
    /* Create a three-level trie */
8574
136
    result = PyObject_Malloc(sizeof(struct encoding_map) +
8575
136
                             16*count2 + 128*count3 - 1);
8576
136
    if (!result) {
8577
0
        return PyErr_NoMemory();
8578
0
    }
8579
8580
136
    _PyObject_Init(result, &EncodingMapType);
8581
136
    mresult = (struct encoding_map*)result;
8582
136
    mresult->count2 = count2;
8583
136
    mresult->count3 = count3;
8584
136
    mlevel1 = mresult->level1;
8585
136
    mlevel2 = mresult->level23;
8586
136
    mlevel3 = mresult->level23 + 16*count2;
8587
136
    memcpy(mlevel1, level1, 32);
8588
136
    memset(mlevel2, 0xFF, 16*count2);
8589
136
    memset(mlevel3, 0, 128*count3);
8590
136
    count3 = 0;
8591
34.8k
    for (i = 1; i < length; i++) {
8592
34.6k
        int o1, o2, o3, i2, i3;
8593
34.6k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8594
34.6k
        if (ch == 0xFFFE)
8595
            /* unmapped character */
8596
976
            continue;
8597
33.7k
        o1 = ch>>11;
8598
33.7k
        o2 = (ch>>7) & 0xF;
8599
33.7k
        i2 = 16*mlevel1[o1] + o2;
8600
33.7k
        if (mlevel2[i2] == 0xFF)
8601
732
            mlevel2[i2] = count3++;
8602
33.7k
        o3 = ch & 0x7F;
8603
33.7k
        i3 = 128*mlevel2[i2] + o3;
8604
33.7k
        mlevel3[i3] = i;
8605
33.7k
    }
8606
136
    return result;
8607
136
}
8608
8609
static int
8610
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8611
0
{
8612
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8613
0
    int l1 = c>>11;
8614
0
    int l2 = (c>>7) & 0xF;
8615
0
    int l3 = c & 0x7F;
8616
0
    int i;
8617
8618
0
    if (c > 0xFFFF)
8619
0
        return -1;
8620
0
    if (c == 0)
8621
0
        return 0;
8622
    /* level 1*/
8623
0
    i = map->level1[l1];
8624
0
    if (i == 0xFF) {
8625
0
        return -1;
8626
0
    }
8627
    /* level 2*/
8628
0
    i = map->level23[16*i+l2];
8629
0
    if (i == 0xFF) {
8630
0
        return -1;
8631
0
    }
8632
    /* level 3 */
8633
0
    i = map->level23[16*map->count2 + 128*i + l3];
8634
0
    if (i == 0) {
8635
0
        return -1;
8636
0
    }
8637
0
    return i;
8638
0
}
8639
8640
/* Lookup the character in the mapping.
8641
   On success, return PyLong, PyBytes or None (if the character can't be found).
8642
   If the result is PyLong, put its value in replace.
8643
   On error, return NULL.
8644
   */
8645
static PyObject *
8646
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8647
0
{
8648
0
    PyObject *w = PyLong_FromLong((long)c);
8649
0
    PyObject *x;
8650
8651
0
    if (w == NULL)
8652
0
        return NULL;
8653
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8654
0
    Py_DECREF(w);
8655
0
    if (rc == 0) {
8656
        /* No mapping found means: mapping is undefined. */
8657
0
        Py_RETURN_NONE;
8658
0
    }
8659
0
    if (x == NULL) {
8660
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8661
            /* No mapping found means: mapping is undefined. */
8662
0
            PyErr_Clear();
8663
0
            Py_RETURN_NONE;
8664
0
        } else
8665
0
            return NULL;
8666
0
    }
8667
0
    else if (x == Py_None)
8668
0
        return x;
8669
0
    else if (PyLong_Check(x)) {
8670
0
        long value = PyLong_AsLong(x);
8671
0
        if (value < 0 || value > 255) {
8672
0
            PyErr_SetString(PyExc_TypeError,
8673
0
                            "character mapping must be in range(256)");
8674
0
            Py_DECREF(x);
8675
0
            return NULL;
8676
0
        }
8677
0
        *replace = (unsigned char)value;
8678
0
        return x;
8679
0
    }
8680
0
    else if (PyBytes_Check(x))
8681
0
        return x;
8682
0
    else {
8683
        /* wrong return value */
8684
0
        PyErr_Format(PyExc_TypeError,
8685
0
                     "character mapping must return integer, bytes or None, not %.400s",
8686
0
                     Py_TYPE(x)->tp_name);
8687
0
        Py_DECREF(x);
8688
0
        return NULL;
8689
0
    }
8690
0
}
8691
8692
static int
8693
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8694
0
{
8695
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8696
    /* exponentially overallocate to minimize reallocations */
8697
0
    if (requiredsize < 2 * outsize)
8698
0
        requiredsize = 2 * outsize;
8699
0
    return PyBytesWriter_Resize(writer, requiredsize);
8700
0
}
8701
8702
typedef enum charmapencode_result {
8703
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
8704
} charmapencode_result;
8705
/* lookup the character, put the result in the output string and adjust
8706
   various state variables. Resize the output bytes object if not enough
8707
   space is available. Return a new reference to the object that
8708
   was put in the output buffer, or Py_None, if the mapping was undefined
8709
   (in which case no character was written) or NULL, if a
8710
   reallocation error occurred. The caller must decref the result */
8711
static charmapencode_result
8712
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8713
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8714
0
{
8715
0
    PyObject *rep;
8716
0
    unsigned char replace;
8717
0
    char *outstart;
8718
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8719
8720
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8721
0
        int res = encoding_map_lookup(c, mapping);
8722
0
        Py_ssize_t requiredsize = *outpos+1;
8723
0
        if (res == -1) {
8724
0
            return enc_FAILED;
8725
0
        }
8726
8727
0
        if (outsize<requiredsize) {
8728
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8729
0
                return enc_EXCEPTION;
8730
0
            }
8731
0
        }
8732
0
        outstart = _PyBytesWriter_GetData(writer);
8733
0
        outstart[(*outpos)++] = (char)res;
8734
0
        return enc_SUCCESS;
8735
0
    }
8736
8737
0
    rep = charmapencode_lookup(c, mapping, &replace);
8738
0
    if (rep==NULL)
8739
0
        return enc_EXCEPTION;
8740
0
    else if (rep==Py_None) {
8741
0
        Py_DECREF(rep);
8742
0
        return enc_FAILED;
8743
0
    } else {
8744
0
        if (PyLong_Check(rep)) {
8745
0
            Py_ssize_t requiredsize = *outpos+1;
8746
0
            if (outsize<requiredsize)
8747
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8748
0
                    Py_DECREF(rep);
8749
0
                    return enc_EXCEPTION;
8750
0
                }
8751
0
            outstart = _PyBytesWriter_GetData(writer);
8752
0
            outstart[(*outpos)++] = (char)replace;
8753
0
        }
8754
0
        else {
8755
0
            const char *repchars = PyBytes_AS_STRING(rep);
8756
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8757
0
            Py_ssize_t requiredsize = *outpos+repsize;
8758
0
            if (outsize<requiredsize)
8759
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8760
0
                    Py_DECREF(rep);
8761
0
                    return enc_EXCEPTION;
8762
0
                }
8763
0
            outstart = _PyBytesWriter_GetData(writer);
8764
0
            memcpy(outstart + *outpos, repchars, repsize);
8765
0
            *outpos += repsize;
8766
0
        }
8767
0
    }
8768
0
    Py_DECREF(rep);
8769
0
    return enc_SUCCESS;
8770
0
}
8771
8772
/* handle an error in _PyUnicode_EncodeCharmap()
8773
   Return 0 on success, -1 on error */
8774
static int
8775
charmap_encoding_error(
8776
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8777
    PyObject **exceptionObject,
8778
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8779
    PyBytesWriter *writer, Py_ssize_t *respos)
8780
0
{
8781
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8782
0
    Py_ssize_t size, repsize;
8783
0
    Py_ssize_t newpos;
8784
0
    int kind;
8785
0
    const void *data;
8786
0
    Py_ssize_t index;
8787
    /* startpos for collecting unencodable chars */
8788
0
    Py_ssize_t collstartpos = *inpos;
8789
0
    Py_ssize_t collendpos = *inpos+1;
8790
0
    Py_ssize_t collpos;
8791
0
    const char *encoding = "charmap";
8792
0
    const char *reason = "character maps to <undefined>";
8793
0
    charmapencode_result x;
8794
0
    Py_UCS4 ch;
8795
0
    int val;
8796
8797
0
    size = PyUnicode_GET_LENGTH(unicode);
8798
    /* find all unencodable characters */
8799
0
    while (collendpos < size) {
8800
0
        PyObject *rep;
8801
0
        unsigned char replace;
8802
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8803
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8804
0
            val = encoding_map_lookup(ch, mapping);
8805
0
            if (val != -1)
8806
0
                break;
8807
0
            ++collendpos;
8808
0
            continue;
8809
0
        }
8810
8811
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8812
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8813
0
        if (rep==NULL)
8814
0
            return -1;
8815
0
        else if (rep!=Py_None) {
8816
0
            Py_DECREF(rep);
8817
0
            break;
8818
0
        }
8819
0
        Py_DECREF(rep);
8820
0
        ++collendpos;
8821
0
    }
8822
    /* cache callback name lookup
8823
     * (if not done yet, i.e. it's the first error) */
8824
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8825
0
        *error_handler = _Py_GetErrorHandler(errors);
8826
8827
0
    switch (*error_handler) {
8828
0
    case _Py_ERROR_STRICT:
8829
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8830
0
        return -1;
8831
8832
0
    case _Py_ERROR_REPLACE:
8833
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8834
0
            x = charmapencode_output('?', mapping, writer, respos);
8835
0
            if (x==enc_EXCEPTION) {
8836
0
                return -1;
8837
0
            }
8838
0
            else if (x==enc_FAILED) {
8839
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8840
0
                return -1;
8841
0
            }
8842
0
        }
8843
0
        _Py_FALLTHROUGH;
8844
0
    case _Py_ERROR_IGNORE:
8845
0
        *inpos = collendpos;
8846
0
        break;
8847
8848
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8849
        /* generate replacement (temporarily (mis)uses p) */
8850
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8851
0
            char buffer[2+29+1+1];
8852
0
            char *cp;
8853
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8854
0
            for (cp = buffer; *cp; ++cp) {
8855
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8856
0
                if (x==enc_EXCEPTION)
8857
0
                    return -1;
8858
0
                else if (x==enc_FAILED) {
8859
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8860
0
                    return -1;
8861
0
                }
8862
0
            }
8863
0
        }
8864
0
        *inpos = collendpos;
8865
0
        break;
8866
8867
0
    default:
8868
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8869
0
                                                      encoding, reason, unicode, exceptionObject,
8870
0
                                                      collstartpos, collendpos, &newpos);
8871
0
        if (repunicode == NULL)
8872
0
            return -1;
8873
0
        if (PyBytes_Check(repunicode)) {
8874
            /* Directly copy bytes result to output. */
8875
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8876
0
            Py_ssize_t requiredsize;
8877
0
            repsize = PyBytes_Size(repunicode);
8878
0
            requiredsize = *respos + repsize;
8879
0
            if (requiredsize > outsize)
8880
                /* Make room for all additional bytes. */
8881
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8882
0
                    Py_DECREF(repunicode);
8883
0
                    return -1;
8884
0
                }
8885
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8886
0
                   PyBytes_AsString(repunicode),  repsize);
8887
0
            *respos += repsize;
8888
0
            *inpos = newpos;
8889
0
            Py_DECREF(repunicode);
8890
0
            break;
8891
0
        }
8892
        /* generate replacement  */
8893
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8894
0
        data = PyUnicode_DATA(repunicode);
8895
0
        kind = PyUnicode_KIND(repunicode);
8896
0
        for (index = 0; index < repsize; index++) {
8897
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8898
0
            x = charmapencode_output(repch, mapping, writer, respos);
8899
0
            if (x==enc_EXCEPTION) {
8900
0
                Py_DECREF(repunicode);
8901
0
                return -1;
8902
0
            }
8903
0
            else if (x==enc_FAILED) {
8904
0
                Py_DECREF(repunicode);
8905
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8906
0
                return -1;
8907
0
            }
8908
0
        }
8909
0
        *inpos = newpos;
8910
0
        Py_DECREF(repunicode);
8911
0
    }
8912
0
    return 0;
8913
0
}
8914
8915
PyObject *
8916
_PyUnicode_EncodeCharmap(PyObject *unicode,
8917
                         PyObject *mapping,
8918
                         const char *errors)
8919
0
{
8920
    /* Default to Latin-1 */
8921
0
    if (mapping == NULL) {
8922
0
        return unicode_encode_ucs1(unicode, errors, 256);
8923
0
    }
8924
8925
0
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
8926
0
    if (size == 0) {
8927
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
8928
0
    }
8929
0
    const void *data = PyUnicode_DATA(unicode);
8930
0
    int kind = PyUnicode_KIND(unicode);
8931
8932
0
    PyObject *error_handler_obj = NULL;
8933
0
    PyObject *exc = NULL;
8934
8935
    /* output object */
8936
0
    PyBytesWriter *writer;
8937
    /* allocate enough for a simple encoding without
8938
       replacements, if we need more, we'll resize */
8939
0
    writer = PyBytesWriter_Create(size);
8940
0
    if (writer == NULL) {
8941
0
        goto onError;
8942
0
    }
8943
8944
    /* current input position */
8945
0
    Py_ssize_t inpos = 0;
8946
    /* current output position */
8947
0
    Py_ssize_t respos = 0;
8948
0
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8949
8950
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8951
0
        char *outstart = _PyBytesWriter_GetData(writer);
8952
0
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8953
8954
0
        while (inpos<size) {
8955
0
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8956
8957
            /* try to encode it */
8958
0
            int res = encoding_map_lookup(ch, mapping);
8959
0
            Py_ssize_t requiredsize = respos+1;
8960
0
            if (res == -1) {
8961
0
                goto enc_FAILED;
8962
0
            }
8963
8964
0
            if (outsize<requiredsize) {
8965
0
                if (charmapencode_resize(writer, &respos, requiredsize)) {
8966
0
                    goto onError;
8967
0
                }
8968
0
                outstart = _PyBytesWriter_GetData(writer);
8969
0
                outsize = _PyBytesWriter_GetSize(writer);
8970
0
            }
8971
0
            outstart[respos++] = (char)res;
8972
8973
            /* done with this character => adjust input position */
8974
0
            ++inpos;
8975
0
            continue;
8976
8977
0
enc_FAILED:
8978
0
            if (charmap_encoding_error(unicode, &inpos, mapping,
8979
0
                                       &exc,
8980
0
                                       &error_handler, &error_handler_obj, errors,
8981
0
                                       writer, &respos)) {
8982
0
                goto onError;
8983
0
            }
8984
0
            outstart = _PyBytesWriter_GetData(writer);
8985
0
            outsize = _PyBytesWriter_GetSize(writer);
8986
0
        }
8987
0
    }
8988
0
    else {
8989
0
        while (inpos<size) {
8990
0
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8991
            /* try to encode it */
8992
0
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
8993
0
            if (x==enc_EXCEPTION) { /* error */
8994
0
                goto onError;
8995
0
            }
8996
0
            if (x==enc_FAILED) { /* unencodable character */
8997
0
                if (charmap_encoding_error(unicode, &inpos, mapping,
8998
0
                                           &exc,
8999
0
                                           &error_handler, &error_handler_obj, errors,
9000
0
                                           writer, &respos)) {
9001
0
                    goto onError;
9002
0
                }
9003
0
            }
9004
0
            else {
9005
                /* done with this character => adjust input position */
9006
0
                ++inpos;
9007
0
            }
9008
0
        }
9009
0
    }
9010
9011
0
    Py_XDECREF(exc);
9012
0
    Py_XDECREF(error_handler_obj);
9013
9014
    /* Resize if we allocated too much */
9015
0
    return PyBytesWriter_FinishWithSize(writer, respos);
9016
9017
0
  onError:
9018
0
    PyBytesWriter_Discard(writer);
9019
0
    Py_XDECREF(exc);
9020
0
    Py_XDECREF(error_handler_obj);
9021
0
    return NULL;
9022
0
}
9023
9024
PyObject *
9025
PyUnicode_AsCharmapString(PyObject *unicode,
9026
                          PyObject *mapping)
9027
0
{
9028
0
    if (!PyUnicode_Check(unicode) || mapping == NULL) {
9029
0
        PyErr_BadArgument();
9030
0
        return NULL;
9031
0
    }
9032
0
    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
9033
0
}
9034
9035
/* create or adjust a UnicodeTranslateError */
9036
static void
9037
make_translate_exception(PyObject **exceptionObject,
9038
                         PyObject *unicode,
9039
                         Py_ssize_t startpos, Py_ssize_t endpos,
9040
                         const char *reason)
9041
0
{
9042
0
    if (*exceptionObject == NULL) {
9043
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9044
0
            unicode, startpos, endpos, reason);
9045
0
    }
9046
0
    else {
9047
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9048
0
            goto onError;
9049
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9050
0
            goto onError;
9051
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9052
0
            goto onError;
9053
0
        return;
9054
0
      onError:
9055
0
        Py_CLEAR(*exceptionObject);
9056
0
    }
9057
0
}
9058
9059
/* error handling callback helper:
9060
   build arguments, call the callback and check the arguments,
9061
   put the result into newpos and return the replacement string, which
9062
   has to be freed by the caller */
9063
static PyObject *
9064
unicode_translate_call_errorhandler(const char *errors,
9065
                                    PyObject **errorHandler,
9066
                                    const char *reason,
9067
                                    PyObject *unicode, PyObject **exceptionObject,
9068
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9069
                                    Py_ssize_t *newpos)
9070
0
{
9071
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9072
9073
0
    Py_ssize_t i_newpos;
9074
0
    PyObject *restuple;
9075
0
    PyObject *resunicode;
9076
9077
0
    if (*errorHandler == NULL) {
9078
0
        *errorHandler = PyCodec_LookupError(errors);
9079
0
        if (*errorHandler == NULL)
9080
0
            return NULL;
9081
0
    }
9082
9083
0
    make_translate_exception(exceptionObject,
9084
0
                             unicode, startpos, endpos, reason);
9085
0
    if (*exceptionObject == NULL)
9086
0
        return NULL;
9087
9088
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9089
0
    if (restuple == NULL)
9090
0
        return NULL;
9091
0
    if (!PyTuple_Check(restuple)) {
9092
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9093
0
        Py_DECREF(restuple);
9094
0
        return NULL;
9095
0
    }
9096
0
    if (!PyArg_ParseTuple(restuple, argparse,
9097
0
                          &resunicode, &i_newpos)) {
9098
0
        Py_DECREF(restuple);
9099
0
        return NULL;
9100
0
    }
9101
0
    if (i_newpos<0)
9102
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9103
0
    else
9104
0
        *newpos = i_newpos;
9105
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9106
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9107
0
        Py_DECREF(restuple);
9108
0
        return NULL;
9109
0
    }
9110
0
    Py_INCREF(resunicode);
9111
0
    Py_DECREF(restuple);
9112
0
    return resunicode;
9113
0
}
9114
9115
/* Lookup the character ch in the mapping and put the result in result,
9116
   which must be decrefed by the caller.
9117
   The result can be PyLong, PyUnicode, None or NULL.
9118
   If the result is PyLong, put its value in replace.
9119
   Return 0 on success, -1 on error */
9120
static int
9121
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9122
18.8k
{
9123
18.8k
    PyObject *w = PyLong_FromLong((long)c);
9124
18.8k
    PyObject *x;
9125
9126
18.8k
    if (w == NULL)
9127
0
        return -1;
9128
18.8k
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9129
18.8k
    Py_DECREF(w);
9130
18.8k
    if (rc == 0) {
9131
        /* No mapping found means: use 1:1 mapping. */
9132
6.35k
        *result = NULL;
9133
6.35k
        return 0;
9134
6.35k
    }
9135
12.4k
    if (x == NULL) {
9136
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9137
            /* No mapping found means: use 1:1 mapping. */
9138
0
            PyErr_Clear();
9139
0
            *result = NULL;
9140
0
            return 0;
9141
0
        } else
9142
0
            return -1;
9143
0
    }
9144
12.4k
    else if (x == Py_None) {
9145
0
        *result = x;
9146
0
        return 0;
9147
0
    }
9148
12.4k
    else if (PyLong_Check(x)) {
9149
0
        long value = PyLong_AsLong(x);
9150
0
        if (value < 0 || value > MAX_UNICODE) {
9151
0
            PyErr_Format(PyExc_ValueError,
9152
0
                         "character mapping must be in range(0x%lx)",
9153
0
                         (unsigned long)MAX_UNICODE + 1);
9154
0
            Py_DECREF(x);
9155
0
            return -1;
9156
0
        }
9157
0
        *result = x;
9158
0
        *replace = (Py_UCS4)value;
9159
0
        return 0;
9160
0
    }
9161
12.4k
    else if (PyUnicode_Check(x)) {
9162
12.4k
        *result = x;
9163
12.4k
        return 0;
9164
12.4k
    }
9165
0
    else {
9166
        /* wrong return value */
9167
0
        PyErr_SetString(PyExc_TypeError,
9168
0
                        "character mapping must return integer, None or str");
9169
0
        Py_DECREF(x);
9170
0
        return -1;
9171
0
    }
9172
12.4k
}
9173
9174
/* lookup the character, write the result into the writer.
9175
   Return 1 if the result was written into the writer, return 0 if the mapping
9176
   was undefined, raise an exception return -1 on error. */
9177
static int
9178
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9179
                        _PyUnicodeWriter *writer)
9180
6.40k
{
9181
6.40k
    PyObject *item;
9182
6.40k
    Py_UCS4 replace;
9183
9184
6.40k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9185
0
        return -1;
9186
9187
6.40k
    if (item == NULL) {
9188
        /* not found => default to 1:1 mapping */
9189
107
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9190
0
            return -1;
9191
0
        }
9192
107
        return 1;
9193
107
    }
9194
9195
6.29k
    if (item == Py_None) {
9196
0
        Py_DECREF(item);
9197
0
        return 0;
9198
0
    }
9199
9200
6.29k
    if (PyLong_Check(item)) {
9201
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9202
0
            Py_DECREF(item);
9203
0
            return -1;
9204
0
        }
9205
0
        Py_DECREF(item);
9206
0
        return 1;
9207
0
    }
9208
9209
6.29k
    if (!PyUnicode_Check(item)) {
9210
0
        Py_DECREF(item);
9211
0
        return -1;
9212
0
    }
9213
9214
6.29k
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9215
0
        Py_DECREF(item);
9216
0
        return -1;
9217
0
    }
9218
9219
6.29k
    Py_DECREF(item);
9220
6.29k
    return 1;
9221
6.29k
}
9222
9223
static int
9224
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9225
                              Py_UCS1 *translate)
9226
12.4k
{
9227
12.4k
    PyObject *item = NULL;
9228
12.4k
    Py_UCS4 replace;
9229
12.4k
    int ret = 0;
9230
9231
12.4k
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9232
0
        return -1;
9233
0
    }
9234
9235
12.4k
    if (item == Py_None) {
9236
        /* deletion */
9237
0
        translate[ch] = 0xfe;
9238
0
    }
9239
12.4k
    else if (item == NULL) {
9240
        /* not found => default to 1:1 mapping */
9241
6.25k
        translate[ch] = ch;
9242
6.25k
        return 1;
9243
6.25k
    }
9244
6.18k
    else if (PyLong_Check(item)) {
9245
0
        if (replace > 127) {
9246
            /* invalid character or character outside ASCII:
9247
               skip the fast translate */
9248
0
            goto exit;
9249
0
        }
9250
0
        translate[ch] = (Py_UCS1)replace;
9251
0
    }
9252
6.18k
    else if (PyUnicode_Check(item)) {
9253
6.18k
        if (PyUnicode_GET_LENGTH(item) != 1)
9254
6.18k
            goto exit;
9255
9256
0
        replace = PyUnicode_READ_CHAR(item, 0);
9257
0
        if (replace > 127)
9258
0
            goto exit;
9259
0
        translate[ch] = (Py_UCS1)replace;
9260
0
    }
9261
0
    else {
9262
        /* not None, NULL, long or unicode */
9263
0
        goto exit;
9264
0
    }
9265
0
    ret = 1;
9266
9267
6.18k
  exit:
9268
6.18k
    Py_DECREF(item);
9269
6.18k
    return ret;
9270
0
}
9271
9272
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9273
   was translated into writer, return 0 if the input string was partially
9274
   translated into writer, raise an exception and return -1 on error. */
9275
static int
9276
unicode_fast_translate(PyObject *input, PyObject *mapping,
9277
                       _PyUnicodeWriter *writer, int ignore,
9278
                       Py_ssize_t *input_pos)
9279
12.3k
{
9280
12.3k
    Py_UCS1 ascii_table[128], ch, ch2;
9281
12.3k
    Py_ssize_t len;
9282
12.3k
    const Py_UCS1 *in, *end;
9283
12.3k
    Py_UCS1 *out;
9284
12.3k
    int res = 0;
9285
9286
12.3k
    len = PyUnicode_GET_LENGTH(input);
9287
9288
12.3k
    memset(ascii_table, 0xff, 128);
9289
9290
12.3k
    in = PyUnicode_1BYTE_DATA(input);
9291
12.3k
    end = in + len;
9292
9293
12.3k
    assert(PyUnicode_IS_ASCII(writer->buffer));
9294
12.3k
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9295
12.3k
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9296
9297
18.6k
    for (; in < end; in++) {
9298
12.4k
        ch = *in;
9299
12.4k
        ch2 = ascii_table[ch];
9300
12.4k
        if (ch2 == 0xff) {
9301
12.4k
            int translate = unicode_fast_translate_lookup(mapping, ch,
9302
12.4k
                                                          ascii_table);
9303
12.4k
            if (translate < 0)
9304
0
                return -1;
9305
12.4k
            if (translate == 0)
9306
6.18k
                goto exit;
9307
6.25k
            ch2 = ascii_table[ch];
9308
6.25k
        }
9309
6.29k
        if (ch2 == 0xfe) {
9310
0
            if (ignore)
9311
0
                continue;
9312
0
            goto exit;
9313
0
        }
9314
6.29k
        assert(ch2 < 128);
9315
6.29k
        *out = ch2;
9316
6.29k
        out++;
9317
6.29k
    }
9318
6.17k
    res = 1;
9319
9320
12.3k
exit:
9321
12.3k
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9322
12.3k
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9323
12.3k
    return res;
9324
6.17k
}
9325
9326
static PyObject *
9327
_PyUnicode_TranslateCharmap(PyObject *input,
9328
                            PyObject *mapping,
9329
                            const char *errors)
9330
12.3k
{
9331
    /* input object */
9332
12.3k
    const void *data;
9333
12.3k
    Py_ssize_t size, i;
9334
12.3k
    int kind;
9335
    /* output buffer */
9336
12.3k
    _PyUnicodeWriter writer;
9337
    /* error handler */
9338
12.3k
    const char *reason = "character maps to <undefined>";
9339
12.3k
    PyObject *errorHandler = NULL;
9340
12.3k
    PyObject *exc = NULL;
9341
12.3k
    int ignore;
9342
12.3k
    int res;
9343
9344
12.3k
    if (mapping == NULL) {
9345
0
        PyErr_BadArgument();
9346
0
        return NULL;
9347
0
    }
9348
9349
12.3k
    data = PyUnicode_DATA(input);
9350
12.3k
    kind = PyUnicode_KIND(input);
9351
12.3k
    size = PyUnicode_GET_LENGTH(input);
9352
9353
12.3k
    if (size == 0)
9354
0
        return PyUnicode_FromObject(input);
9355
9356
    /* allocate enough for a simple 1:1 translation without
9357
       replacements, if we need more, we'll resize */
9358
12.3k
    _PyUnicodeWriter_Init(&writer);
9359
12.3k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
9360
0
        goto onError;
9361
9362
12.3k
    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9363
9364
12.3k
    if (PyUnicode_IS_ASCII(input)) {
9365
12.3k
        res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9366
12.3k
        if (res < 0) {
9367
0
            _PyUnicodeWriter_Dealloc(&writer);
9368
0
            return NULL;
9369
0
        }
9370
12.3k
        if (res == 1)
9371
6.17k
            return _PyUnicodeWriter_Finish(&writer);
9372
12.3k
    }
9373
0
    else {
9374
0
        i = 0;
9375
0
    }
9376
9377
12.5k
    while (i<size) {
9378
        /* try to encode it */
9379
6.40k
        int translate;
9380
6.40k
        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9381
6.40k
        Py_ssize_t newpos;
9382
        /* startpos for collecting untranslatable chars */
9383
6.40k
        Py_ssize_t collstart;
9384
6.40k
        Py_ssize_t collend;
9385
6.40k
        Py_UCS4 ch;
9386
9387
6.40k
        ch = PyUnicode_READ(kind, data, i);
9388
6.40k
        translate = charmaptranslate_output(ch, mapping, &writer);
9389
6.40k
        if (translate < 0)
9390
0
            goto onError;
9391
9392
6.40k
        if (translate != 0) {
9393
            /* it worked => adjust input pointer */
9394
6.40k
            ++i;
9395
6.40k
            continue;
9396
6.40k
        }
9397
9398
        /* untranslatable character */
9399
0
        collstart = i;
9400
0
        collend = i+1;
9401
9402
        /* find all untranslatable characters */
9403
0
        while (collend < size) {
9404
0
            PyObject *x;
9405
0
            Py_UCS4 replace;
9406
0
            ch = PyUnicode_READ(kind, data, collend);
9407
0
            if (charmaptranslate_lookup(ch, mapping, &x, &replace))
9408
0
                goto onError;
9409
0
            Py_XDECREF(x);
9410
0
            if (x != Py_None)
9411
0
                break;
9412
0
            ++collend;
9413
0
        }
9414
9415
0
        if (ignore) {
9416
0
            i = collend;
9417
0
        }
9418
0
        else {
9419
0
            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9420
0
                                                             reason, input, &exc,
9421
0
                                                             collstart, collend, &newpos);
9422
0
            if (repunicode == NULL)
9423
0
                goto onError;
9424
0
            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
9425
0
                Py_DECREF(repunicode);
9426
0
                goto onError;
9427
0
            }
9428
0
            Py_DECREF(repunicode);
9429
0
            i = newpos;
9430
0
        }
9431
0
    }
9432
6.18k
    Py_XDECREF(exc);
9433
6.18k
    Py_XDECREF(errorHandler);
9434
6.18k
    return _PyUnicodeWriter_Finish(&writer);
9435
9436
0
  onError:
9437
0
    _PyUnicodeWriter_Dealloc(&writer);
9438
0
    Py_XDECREF(exc);
9439
0
    Py_XDECREF(errorHandler);
9440
0
    return NULL;
9441
6.18k
}
9442
9443
PyObject *
9444
PyUnicode_Translate(PyObject *str,
9445
                    PyObject *mapping,
9446
                    const char *errors)
9447
0
{
9448
0
    if (ensure_unicode(str) < 0)
9449
0
        return NULL;
9450
0
    return _PyUnicode_TranslateCharmap(str, mapping, errors);
9451
0
}
9452
9453
PyObject *
9454
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9455
12.6M
{
9456
12.6M
    if (!PyUnicode_Check(unicode)) {
9457
0
        PyErr_BadInternalCall();
9458
0
        return NULL;
9459
0
    }
9460
12.6M
    if (PyUnicode_IS_ASCII(unicode)) {
9461
        /* If the string is already ASCII, just return the same string */
9462
12.6M
        return Py_NewRef(unicode);
9463
12.6M
    }
9464
9465
2.45k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9466
2.45k
    PyObject *result = PyUnicode_New(len, 127);
9467
2.45k
    if (result == NULL) {
9468
0
        return NULL;
9469
0
    }
9470
9471
2.45k
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9472
2.45k
    int kind = PyUnicode_KIND(unicode);
9473
2.45k
    const void *data = PyUnicode_DATA(unicode);
9474
2.45k
    Py_ssize_t i;
9475
88.7k
    for (i = 0; i < len; ++i) {
9476
86.4k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9477
86.4k
        if (ch < 127) {
9478
83.5k
            out[i] = ch;
9479
83.5k
        }
9480
2.89k
        else if (Py_UNICODE_ISSPACE(ch)) {
9481
1.27k
            out[i] = ' ';
9482
1.27k
        }
9483
1.62k
        else {
9484
1.62k
            int decimal = Py_UNICODE_TODECIMAL(ch);
9485
1.62k
            if (decimal < 0) {
9486
132
                out[i] = '?';
9487
132
                out[i+1] = '\0';
9488
132
                _PyUnicode_LENGTH(result) = i + 1;
9489
132
                break;
9490
132
            }
9491
1.48k
            out[i] = '0' + decimal;
9492
1.48k
        }
9493
86.4k
    }
9494
9495
2.45k
    assert(_PyUnicode_CheckConsistency(result, 1));
9496
2.45k
    return result;
9497
2.45k
}
9498
9499
/* --- Helpers ------------------------------------------------------------ */
9500
9501
/* helper macro to fixup start/end slice values */
9502
#define ADJUST_INDICES(start, end, len) \
9503
114M
    do {                                \
9504
114M
        if (end > len) {                \
9505
87.3M
            end = len;                  \
9506
87.3M
        }                               \
9507
114M
        else if (end < 0) {             \
9508
0
            end += len;                 \
9509
0
            if (end < 0) {              \
9510
0
                end = 0;                \
9511
0
            }                           \
9512
0
        }                               \
9513
114M
        if (start < 0) {                \
9514
19.0k
            start += len;               \
9515
19.0k
            if (start < 0) {            \
9516
0
                start = 0;              \
9517
0
            }                           \
9518
19.0k
        }                               \
9519
114M
    } while (0)
9520
9521
static Py_ssize_t
9522
any_find_slice(PyObject* s1, PyObject* s2,
9523
               Py_ssize_t start,
9524
               Py_ssize_t end,
9525
               int direction)
9526
26.8M
{
9527
26.8M
    int kind1, kind2;
9528
26.8M
    const void *buf1, *buf2;
9529
26.8M
    Py_ssize_t len1, len2, result;
9530
9531
26.8M
    kind1 = PyUnicode_KIND(s1);
9532
26.8M
    kind2 = PyUnicode_KIND(s2);
9533
26.8M
    if (kind1 < kind2)
9534
0
        return -1;
9535
9536
26.8M
    len1 = PyUnicode_GET_LENGTH(s1);
9537
26.8M
    len2 = PyUnicode_GET_LENGTH(s2);
9538
26.8M
    ADJUST_INDICES(start, end, len1);
9539
26.8M
    if (end - start < len2)
9540
2.01M
        return -1;
9541
9542
24.8M
    buf1 = PyUnicode_DATA(s1);
9543
24.8M
    buf2 = PyUnicode_DATA(s2);
9544
24.8M
    if (len2 == 1) {
9545
24.0M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9546
24.0M
        result = findchar((const char *)buf1 + kind1*start,
9547
24.0M
                          kind1, end - start, ch, direction);
9548
24.0M
        if (result == -1)
9549
3.72M
            return -1;
9550
20.3M
        else
9551
20.3M
            return start + result;
9552
24.0M
    }
9553
9554
810k
    if (kind2 != kind1) {
9555
188k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9556
188k
        if (!buf2)
9557
0
            return -2;
9558
188k
    }
9559
9560
810k
    if (direction > 0) {
9561
810k
        switch (kind1) {
9562
622k
        case PyUnicode_1BYTE_KIND:
9563
622k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9564
368k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9565
254k
            else
9566
254k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9567
622k
            break;
9568
67.6k
        case PyUnicode_2BYTE_KIND:
9569
67.6k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9570
67.6k
            break;
9571
120k
        case PyUnicode_4BYTE_KIND:
9572
120k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9573
120k
            break;
9574
0
        default:
9575
0
            Py_UNREACHABLE();
9576
810k
        }
9577
810k
    }
9578
0
    else {
9579
0
        switch (kind1) {
9580
0
        case PyUnicode_1BYTE_KIND:
9581
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9582
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9583
0
            else
9584
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9585
0
            break;
9586
0
        case PyUnicode_2BYTE_KIND:
9587
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9588
0
            break;
9589
0
        case PyUnicode_4BYTE_KIND:
9590
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9591
0
            break;
9592
0
        default:
9593
0
            Py_UNREACHABLE();
9594
0
        }
9595
0
    }
9596
9597
810k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9598
810k
    if (kind2 != kind1)
9599
188k
        PyMem_Free((void *)buf2);
9600
9601
810k
    return result;
9602
810k
}
9603
9604
9605
Py_ssize_t
9606
PyUnicode_Count(PyObject *str,
9607
                PyObject *substr,
9608
                Py_ssize_t start,
9609
                Py_ssize_t end)
9610
0
{
9611
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9612
0
        return -1;
9613
9614
0
    return unicode_count_impl(str, substr, start, end);
9615
0
}
9616
9617
Py_ssize_t
9618
PyUnicode_Find(PyObject *str,
9619
               PyObject *substr,
9620
               Py_ssize_t start,
9621
               Py_ssize_t end,
9622
               int direction)
9623
0
{
9624
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9625
0
        return -2;
9626
9627
0
    return any_find_slice(str, substr, start, end, direction);
9628
0
}
9629
9630
Py_ssize_t
9631
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9632
                   Py_ssize_t start, Py_ssize_t end,
9633
                   int direction)
9634
3.79M
{
9635
3.79M
    int kind;
9636
3.79M
    Py_ssize_t len, result;
9637
3.79M
    len = PyUnicode_GET_LENGTH(str);
9638
3.79M
    ADJUST_INDICES(start, end, len);
9639
3.79M
    if (end - start < 1)
9640
0
        return -1;
9641
3.79M
    kind = PyUnicode_KIND(str);
9642
3.79M
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9643
3.79M
                      kind, end-start, ch, direction);
9644
3.79M
    if (result == -1)
9645
2.86M
        return -1;
9646
932k
    else
9647
932k
        return start + result;
9648
3.79M
}
9649
9650
static int
9651
tailmatch(PyObject *self,
9652
          PyObject *substring,
9653
          Py_ssize_t start,
9654
          Py_ssize_t end,
9655
          int direction)
9656
57.1M
{
9657
57.1M
    int kind_self;
9658
57.1M
    int kind_sub;
9659
57.1M
    const void *data_self;
9660
57.1M
    const void *data_sub;
9661
57.1M
    Py_ssize_t offset;
9662
57.1M
    Py_ssize_t i;
9663
57.1M
    Py_ssize_t end_sub;
9664
9665
57.1M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9666
57.1M
    end -= PyUnicode_GET_LENGTH(substring);
9667
57.1M
    if (end < start)
9668
10.0M
        return 0;
9669
9670
47.0M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9671
0
        return 1;
9672
9673
47.0M
    kind_self = PyUnicode_KIND(self);
9674
47.0M
    data_self = PyUnicode_DATA(self);
9675
47.0M
    kind_sub = PyUnicode_KIND(substring);
9676
47.0M
    data_sub = PyUnicode_DATA(substring);
9677
47.0M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9678
9679
47.0M
    if (direction > 0)
9680
7.46M
        offset = end;
9681
39.5M
    else
9682
39.5M
        offset = start;
9683
9684
47.0M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9685
47.0M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9686
32.6M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9687
32.6M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9688
        /* If both are of the same kind, memcmp is sufficient */
9689
13.2M
        if (kind_self == kind_sub) {
9690
6.61M
            return ! memcmp((char *)data_self +
9691
6.61M
                                (offset * PyUnicode_KIND(substring)),
9692
6.61M
                            data_sub,
9693
6.61M
                            PyUnicode_GET_LENGTH(substring) *
9694
6.61M
                                PyUnicode_KIND(substring));
9695
6.61M
        }
9696
        /* otherwise we have to compare each character by first accessing it */
9697
6.67M
        else {
9698
            /* We do not need to compare 0 and len(substring)-1 because
9699
               the if statement above ensured already that they are equal
9700
               when we end up here. */
9701
6.82M
            for (i = 1; i < end_sub; ++i) {
9702
177k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9703
177k
                    PyUnicode_READ(kind_sub, data_sub, i))
9704
26.4k
                    return 0;
9705
177k
            }
9706
6.64M
            return 1;
9707
6.67M
        }
9708
13.2M
    }
9709
9710
33.7M
    return 0;
9711
47.0M
}
9712
9713
Py_ssize_t
9714
PyUnicode_Tailmatch(PyObject *str,
9715
                    PyObject *substr,
9716
                    Py_ssize_t start,
9717
                    Py_ssize_t end,
9718
                    int direction)
9719
369
{
9720
369
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9721
0
        return -1;
9722
9723
369
    return tailmatch(str, substr, start, end, direction);
9724
369
}
9725
9726
static PyObject *
9727
ascii_upper_or_lower(PyObject *self, int lower)
9728
70.2M
{
9729
70.2M
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9730
70.2M
    const char *data = PyUnicode_DATA(self);
9731
70.2M
    char *resdata;
9732
70.2M
    PyObject *res;
9733
9734
70.2M
    res = PyUnicode_New(len, 127);
9735
70.2M
    if (res == NULL)
9736
0
        return NULL;
9737
70.2M
    resdata = PyUnicode_DATA(res);
9738
70.2M
    if (lower)
9739
70.2M
        _Py_bytes_lower(resdata, data, len);
9740
306
    else
9741
306
        _Py_bytes_upper(resdata, data, len);
9742
70.2M
    return res;
9743
70.2M
}
9744
9745
static Py_UCS4
9746
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9747
461k
{
9748
461k
    Py_ssize_t j;
9749
461k
    int final_sigma;
9750
461k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9751
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9752
9753
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9754
9755
    where ! is a negation and \p{xxx} is a character with property xxx.
9756
    */
9757
847k
    for (j = i - 1; j >= 0; j--) {
9758
845k
        c = PyUnicode_READ(kind, data, j);
9759
845k
        if (!_PyUnicode_IsCaseIgnorable(c))
9760
459k
            break;
9761
845k
    }
9762
461k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9763
461k
    if (final_sigma) {
9764
710k
        for (j = i + 1; j < length; j++) {
9765
706k
            c = PyUnicode_READ(kind, data, j);
9766
706k
            if (!_PyUnicode_IsCaseIgnorable(c))
9767
355k
                break;
9768
706k
        }
9769
359k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9770
359k
    }
9771
461k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9772
461k
}
9773
9774
static int
9775
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9776
           Py_UCS4 c, Py_UCS4 *mapped)
9777
130M
{
9778
    /* Obscure special case. */
9779
130M
    if (c == 0x3A3) {
9780
461k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9781
461k
        return 1;
9782
461k
    }
9783
130M
    return _PyUnicode_ToLowerFull(c, mapped);
9784
130M
}
9785
9786
static Py_ssize_t
9787
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9788
0
{
9789
0
    Py_ssize_t i, k = 0;
9790
0
    int n_res, j;
9791
0
    Py_UCS4 c, mapped[3];
9792
9793
0
    c = PyUnicode_READ(kind, data, 0);
9794
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9795
0
    for (j = 0; j < n_res; j++) {
9796
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9797
0
        res[k++] = mapped[j];
9798
0
    }
9799
0
    for (i = 1; i < length; i++) {
9800
0
        c = PyUnicode_READ(kind, data, i);
9801
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9802
0
        for (j = 0; j < n_res; j++) {
9803
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9804
0
            res[k++] = mapped[j];
9805
0
        }
9806
0
    }
9807
0
    return k;
9808
0
}
9809
9810
static Py_ssize_t
9811
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9812
0
    Py_ssize_t i, k = 0;
9813
9814
0
    for (i = 0; i < length; i++) {
9815
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9816
0
        int n_res, j;
9817
0
        if (Py_UNICODE_ISUPPER(c)) {
9818
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9819
0
        }
9820
0
        else if (Py_UNICODE_ISLOWER(c)) {
9821
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9822
0
        }
9823
0
        else {
9824
0
            n_res = 1;
9825
0
            mapped[0] = c;
9826
0
        }
9827
0
        for (j = 0; j < n_res; j++) {
9828
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9829
0
            res[k++] = mapped[j];
9830
0
        }
9831
0
    }
9832
0
    return k;
9833
0
}
9834
9835
static Py_ssize_t
9836
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9837
                  Py_UCS4 *maxchar, int lower)
9838
6.04M
{
9839
6.04M
    Py_ssize_t i, k = 0;
9840
9841
137M
    for (i = 0; i < length; i++) {
9842
130M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9843
130M
        int n_res, j;
9844
130M
        if (lower)
9845
130M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9846
0
        else
9847
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9848
261M
        for (j = 0; j < n_res; j++) {
9849
130M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9850
130M
            res[k++] = mapped[j];
9851
130M
        }
9852
130M
    }
9853
6.04M
    return k;
9854
6.04M
}
9855
9856
static Py_ssize_t
9857
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9858
0
{
9859
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9860
0
}
9861
9862
static Py_ssize_t
9863
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9864
6.04M
{
9865
6.04M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9866
6.04M
}
9867
9868
static Py_ssize_t
9869
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9870
0
{
9871
0
    Py_ssize_t i, k = 0;
9872
9873
0
    for (i = 0; i < length; i++) {
9874
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9875
0
        Py_UCS4 mapped[3];
9876
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9877
0
        for (j = 0; j < n_res; j++) {
9878
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9879
0
            res[k++] = mapped[j];
9880
0
        }
9881
0
    }
9882
0
    return k;
9883
0
}
9884
9885
static Py_ssize_t
9886
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887
0
{
9888
0
    Py_ssize_t i, k = 0;
9889
0
    int previous_is_cased;
9890
9891
0
    previous_is_cased = 0;
9892
0
    for (i = 0; i < length; i++) {
9893
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9894
0
        Py_UCS4 mapped[3];
9895
0
        int n_res, j;
9896
9897
0
        if (previous_is_cased)
9898
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9899
0
        else
9900
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9901
9902
0
        for (j = 0; j < n_res; j++) {
9903
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9904
0
            res[k++] = mapped[j];
9905
0
        }
9906
9907
0
        previous_is_cased = _PyUnicode_IsCased(c);
9908
0
    }
9909
0
    return k;
9910
0
}
9911
9912
static PyObject *
9913
case_operation(PyObject *self,
9914
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9915
6.04M
{
9916
6.04M
    PyObject *res = NULL;
9917
6.04M
    Py_ssize_t length, newlength = 0;
9918
6.04M
    int kind, outkind;
9919
6.04M
    const void *data;
9920
6.04M
    void *outdata;
9921
6.04M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9922
9923
6.04M
    kind = PyUnicode_KIND(self);
9924
6.04M
    data = PyUnicode_DATA(self);
9925
6.04M
    length = PyUnicode_GET_LENGTH(self);
9926
6.04M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9927
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9928
0
        return NULL;
9929
0
    }
9930
6.04M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
9931
6.04M
    if (tmp == NULL)
9932
0
        return PyErr_NoMemory();
9933
6.04M
    newlength = perform(kind, data, length, tmp, &maxchar);
9934
6.04M
    res = PyUnicode_New(newlength, maxchar);
9935
6.04M
    if (res == NULL)
9936
0
        goto leave;
9937
6.04M
    tmpend = tmp + newlength;
9938
6.04M
    outdata = PyUnicode_DATA(res);
9939
6.04M
    outkind = PyUnicode_KIND(res);
9940
6.04M
    switch (outkind) {
9941
198k
    case PyUnicode_1BYTE_KIND:
9942
198k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9943
198k
        break;
9944
5.67M
    case PyUnicode_2BYTE_KIND:
9945
5.67M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9946
5.67M
        break;
9947
164k
    case PyUnicode_4BYTE_KIND:
9948
164k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9949
164k
        break;
9950
0
    default:
9951
0
        Py_UNREACHABLE();
9952
6.04M
    }
9953
6.04M
  leave:
9954
6.04M
    PyMem_Free(tmp);
9955
6.04M
    return res;
9956
6.04M
}
9957
9958
PyObject *
9959
PyUnicode_Join(PyObject *separator, PyObject *seq)
9960
28.5M
{
9961
28.5M
    PyObject *res;
9962
28.5M
    PyObject *fseq;
9963
28.5M
    Py_ssize_t seqlen;
9964
28.5M
    PyObject **items;
9965
9966
28.5M
    fseq = PySequence_Fast(seq, "can only join an iterable");
9967
28.5M
    if (fseq == NULL) {
9968
675
        return NULL;
9969
675
    }
9970
9971
28.5M
    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);
9972
9973
28.5M
    items = PySequence_Fast_ITEMS(fseq);
9974
28.5M
    seqlen = PySequence_Fast_GET_SIZE(fseq);
9975
28.5M
    res = _PyUnicode_JoinArray(separator, items, seqlen);
9976
9977
28.5M
    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();
9978
9979
28.5M
    Py_DECREF(fseq);
9980
28.5M
    return res;
9981
28.5M
}
9982
9983
PyObject *
9984
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9985
46.3M
{
9986
46.3M
    PyObject *res = NULL; /* the result */
9987
46.3M
    PyObject *sep = NULL;
9988
46.3M
    Py_ssize_t seplen;
9989
46.3M
    PyObject *item;
9990
46.3M
    Py_ssize_t sz, i, res_offset;
9991
46.3M
    Py_UCS4 maxchar;
9992
46.3M
    Py_UCS4 item_maxchar;
9993
46.3M
    int use_memcpy;
9994
46.3M
    unsigned char *res_data = NULL, *sep_data = NULL;
9995
46.3M
    PyObject *last_obj;
9996
46.3M
    int kind = 0;
9997
9998
    /* If empty sequence, return u"". */
9999
46.3M
    if (seqlen == 0) {
10000
7.22M
        _Py_RETURN_UNICODE_EMPTY();
10001
7.22M
    }
10002
10003
    /* If singleton sequence with an exact Unicode, return that. */
10004
39.1M
    last_obj = NULL;
10005
39.1M
    if (seqlen == 1) {
10006
14.8M
        if (PyUnicode_CheckExact(items[0])) {
10007
13.4M
            res = items[0];
10008
13.4M
            return Py_NewRef(res);
10009
13.4M
        }
10010
1.42M
        seplen = 0;
10011
1.42M
        maxchar = 0;
10012
1.42M
    }
10013
24.3M
    else {
10014
        /* Set up sep and seplen */
10015
24.3M
        if (separator == NULL) {
10016
            /* fall back to a blank space separator */
10017
0
            sep = PyUnicode_FromOrdinal(' ');
10018
0
            if (!sep)
10019
0
                goto onError;
10020
0
            seplen = 1;
10021
0
            maxchar = 32;
10022
0
        }
10023
24.3M
        else {
10024
24.3M
            if (!PyUnicode_Check(separator)) {
10025
0
                PyErr_Format(PyExc_TypeError,
10026
0
                             "separator: expected str instance,"
10027
0
                             " %.80s found",
10028
0
                             Py_TYPE(separator)->tp_name);
10029
0
                goto onError;
10030
0
            }
10031
24.3M
            sep = separator;
10032
24.3M
            seplen = PyUnicode_GET_LENGTH(separator);
10033
24.3M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10034
            /* inc refcount to keep this code path symmetric with the
10035
               above case of a blank separator */
10036
24.3M
            Py_INCREF(sep);
10037
24.3M
        }
10038
24.3M
        last_obj = sep;
10039
24.3M
    }
10040
10041
    /* There are at least two things to join, or else we have a subclass
10042
     * of str in the sequence.
10043
     * Do a pre-pass to figure out the total amount of space we'll
10044
     * need (sz), and see whether all argument are strings.
10045
     */
10046
25.7M
    sz = 0;
10047
#ifdef Py_DEBUG
10048
    use_memcpy = 0;
10049
#else
10050
25.7M
    use_memcpy = 1;
10051
25.7M
#endif
10052
233M
    for (i = 0; i < seqlen; i++) {
10053
207M
        size_t add_sz;
10054
207M
        item = items[i];
10055
207M
        if (!PyUnicode_Check(item)) {
10056
0
            PyErr_Format(PyExc_TypeError,
10057
0
                         "sequence item %zd: expected str instance,"
10058
0
                         " %.80s found",
10059
0
                         i, Py_TYPE(item)->tp_name);
10060
0
            goto onError;
10061
0
        }
10062
207M
        add_sz = PyUnicode_GET_LENGTH(item);
10063
207M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10064
207M
        maxchar = Py_MAX(maxchar, item_maxchar);
10065
207M
        if (i != 0) {
10066
182M
            add_sz += seplen;
10067
182M
        }
10068
207M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10069
0
            PyErr_SetString(PyExc_OverflowError,
10070
0
                            "join() result is too long for a Python string");
10071
0
            goto onError;
10072
0
        }
10073
207M
        sz += add_sz;
10074
207M
        if (use_memcpy && last_obj != NULL) {
10075
130M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10076
3.00M
                use_memcpy = 0;
10077
130M
        }
10078
207M
        last_obj = item;
10079
207M
    }
10080
10081
25.7M
    res = PyUnicode_New(sz, maxchar);
10082
25.7M
    if (res == NULL)
10083
0
        goto onError;
10084
10085
    /* Catenate everything. */
10086
#ifdef Py_DEBUG
10087
    use_memcpy = 0;
10088
#else
10089
25.7M
    if (use_memcpy) {
10090
22.7M
        res_data = PyUnicode_1BYTE_DATA(res);
10091
22.7M
        kind = PyUnicode_KIND(res);
10092
22.7M
        if (seplen != 0)
10093
231k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10094
22.7M
    }
10095
25.7M
#endif
10096
25.7M
    if (use_memcpy) {
10097
133M
        for (i = 0; i < seqlen; ++i) {
10098
110M
            Py_ssize_t itemlen;
10099
110M
            item = items[i];
10100
10101
            /* Copy item, and maybe the separator. */
10102
110M
            if (i && seplen != 0) {
10103
791k
                memcpy(res_data,
10104
791k
                          sep_data,
10105
791k
                          kind * seplen);
10106
791k
                res_data += kind * seplen;
10107
791k
            }
10108
10109
110M
            itemlen = PyUnicode_GET_LENGTH(item);
10110
110M
            if (itemlen != 0) {
10111
100M
                memcpy(res_data,
10112
100M
                          PyUnicode_DATA(item),
10113
100M
                          kind * itemlen);
10114
100M
                res_data += kind * itemlen;
10115
100M
            }
10116
110M
        }
10117
22.7M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10118
22.7M
                           + kind * PyUnicode_GET_LENGTH(res));
10119
22.7M
    }
10120
3.00M
    else {
10121
99.8M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10122
96.8M
            Py_ssize_t itemlen;
10123
96.8M
            item = items[i];
10124
10125
            /* Copy item, and maybe the separator. */
10126
96.8M
            if (i && seplen != 0) {
10127
2.00M
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10128
2.00M
                res_offset += seplen;
10129
2.00M
            }
10130
10131
96.8M
            itemlen = PyUnicode_GET_LENGTH(item);
10132
96.8M
            if (itemlen != 0) {
10133
95.3M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10134
95.3M
                res_offset += itemlen;
10135
95.3M
            }
10136
96.8M
        }
10137
3.00M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10138
3.00M
    }
10139
10140
25.7M
    Py_XDECREF(sep);
10141
25.7M
    assert(_PyUnicode_CheckConsistency(res, 1));
10142
25.7M
    return res;
10143
10144
0
  onError:
10145
0
    Py_XDECREF(sep);
10146
0
    Py_XDECREF(res);
10147
0
    return NULL;
10148
25.7M
}
10149
10150
void
10151
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10152
                    Py_UCS4 fill_char)
10153
17.6k
{
10154
17.6k
    const int kind = PyUnicode_KIND(unicode);
10155
17.6k
    void *data = PyUnicode_DATA(unicode);
10156
17.6k
    assert(_PyUnicode_IsModifiable(unicode));
10157
17.6k
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10158
17.6k
    assert(start >= 0);
10159
17.6k
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10160
17.6k
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10161
17.6k
}
10162
10163
Py_ssize_t
10164
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10165
               Py_UCS4 fill_char)
10166
648
{
10167
648
    Py_ssize_t maxlen;
10168
10169
648
    if (!PyUnicode_Check(unicode)) {
10170
0
        PyErr_BadInternalCall();
10171
0
        return -1;
10172
0
    }
10173
648
    if (unicode_check_modifiable(unicode))
10174
0
        return -1;
10175
10176
648
    if (start < 0) {
10177
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10178
0
        return -1;
10179
0
    }
10180
648
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10181
0
        PyErr_SetString(PyExc_ValueError,
10182
0
                         "fill character is bigger than "
10183
0
                         "the string maximum character");
10184
0
        return -1;
10185
0
    }
10186
10187
648
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10188
648
    length = Py_MIN(maxlen, length);
10189
648
    if (length <= 0)
10190
0
        return 0;
10191
10192
648
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10193
648
    return length;
10194
648
}
10195
10196
static PyObject *
10197
pad(PyObject *self,
10198
    Py_ssize_t left,
10199
    Py_ssize_t right,
10200
    Py_UCS4 fill)
10201
68
{
10202
68
    PyObject *u;
10203
68
    Py_UCS4 maxchar;
10204
68
    int kind;
10205
68
    void *data;
10206
10207
68
    if (left < 0)
10208
0
        left = 0;
10209
68
    if (right < 0)
10210
0
        right = 0;
10211
10212
68
    if (left == 0 && right == 0)
10213
0
        return unicode_result_unchanged(self);
10214
10215
68
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10216
68
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10217
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10218
0
        return NULL;
10219
0
    }
10220
68
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10221
68
    maxchar = Py_MAX(maxchar, fill);
10222
68
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10223
68
    if (!u)
10224
0
        return NULL;
10225
10226
68
    kind = PyUnicode_KIND(u);
10227
68
    data = PyUnicode_DATA(u);
10228
68
    if (left)
10229
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10230
68
    if (right)
10231
68
        _PyUnicode_Fill(kind, data, fill,
10232
68
                        left + _PyUnicode_LENGTH(self), right);
10233
68
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10234
68
    assert(_PyUnicode_CheckConsistency(u, 1));
10235
68
    return u;
10236
68
}
10237
10238
PyObject *
10239
PyUnicode_Splitlines(PyObject *string, int keepends)
10240
17.3k
{
10241
17.3k
    PyObject *list;
10242
10243
17.3k
    if (ensure_unicode(string) < 0)
10244
0
        return NULL;
10245
10246
17.3k
    switch (PyUnicode_KIND(string)) {
10247
4.91k
    case PyUnicode_1BYTE_KIND:
10248
4.91k
        if (PyUnicode_IS_ASCII(string))
10249
3.92k
            list = asciilib_splitlines(
10250
3.92k
                string, PyUnicode_1BYTE_DATA(string),
10251
3.92k
                PyUnicode_GET_LENGTH(string), keepends);
10252
984
        else
10253
984
            list = ucs1lib_splitlines(
10254
984
                string, PyUnicode_1BYTE_DATA(string),
10255
984
                PyUnicode_GET_LENGTH(string), keepends);
10256
4.91k
        break;
10257
8.92k
    case PyUnicode_2BYTE_KIND:
10258
8.92k
        list = ucs2lib_splitlines(
10259
8.92k
            string, PyUnicode_2BYTE_DATA(string),
10260
8.92k
            PyUnicode_GET_LENGTH(string), keepends);
10261
8.92k
        break;
10262
3.50k
    case PyUnicode_4BYTE_KIND:
10263
3.50k
        list = ucs4lib_splitlines(
10264
3.50k
            string, PyUnicode_4BYTE_DATA(string),
10265
3.50k
            PyUnicode_GET_LENGTH(string), keepends);
10266
3.50k
        break;
10267
0
    default:
10268
0
        Py_UNREACHABLE();
10269
17.3k
    }
10270
17.3k
    return list;
10271
17.3k
}
10272
10273
static PyObject *
10274
split(PyObject *self,
10275
      PyObject *substring,
10276
      Py_ssize_t maxcount)
10277
22.0M
{
10278
22.0M
    int kind1, kind2;
10279
22.0M
    const void *buf1, *buf2;
10280
22.0M
    Py_ssize_t len1, len2;
10281
22.0M
    PyObject* out;
10282
22.0M
    len1 = PyUnicode_GET_LENGTH(self);
10283
22.0M
    kind1 = PyUnicode_KIND(self);
10284
10285
22.0M
    if (substring == NULL) {
10286
185k
        if (maxcount < 0) {
10287
161k
            maxcount = (len1 - 1) / 2 + 1;
10288
161k
        }
10289
185k
        switch (kind1) {
10290
115k
        case PyUnicode_1BYTE_KIND:
10291
115k
            if (PyUnicode_IS_ASCII(self))
10292
89.4k
                return asciilib_split_whitespace(
10293
89.4k
                    self,  PyUnicode_1BYTE_DATA(self),
10294
89.4k
                    len1, maxcount
10295
89.4k
                    );
10296
26.3k
            else
10297
26.3k
                return ucs1lib_split_whitespace(
10298
26.3k
                    self,  PyUnicode_1BYTE_DATA(self),
10299
26.3k
                    len1, maxcount
10300
26.3k
                    );
10301
56.6k
        case PyUnicode_2BYTE_KIND:
10302
56.6k
            return ucs2lib_split_whitespace(
10303
56.6k
                self,  PyUnicode_2BYTE_DATA(self),
10304
56.6k
                len1, maxcount
10305
56.6k
                );
10306
12.5k
        case PyUnicode_4BYTE_KIND:
10307
12.5k
            return ucs4lib_split_whitespace(
10308
12.5k
                self,  PyUnicode_4BYTE_DATA(self),
10309
12.5k
                len1, maxcount
10310
12.5k
                );
10311
0
        default:
10312
0
            Py_UNREACHABLE();
10313
185k
        }
10314
185k
    }
10315
10316
21.8M
    kind2 = PyUnicode_KIND(substring);
10317
21.8M
    len2 = PyUnicode_GET_LENGTH(substring);
10318
21.8M
    if (maxcount < 0) {
10319
        // if len2 == 0, it will raise ValueError.
10320
15.8M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10321
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10322
15.8M
        maxcount = maxcount < 0 ? len1 : maxcount;
10323
15.8M
    }
10324
21.8M
    if (kind1 < kind2 || len1 < len2) {
10325
1.71M
        out = PyList_New(1);
10326
1.71M
        if (out == NULL)
10327
0
            return NULL;
10328
1.71M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10329
1.71M
        return out;
10330
1.71M
    }
10331
20.1M
    buf1 = PyUnicode_DATA(self);
10332
20.1M
    buf2 = PyUnicode_DATA(substring);
10333
20.1M
    if (kind2 != kind1) {
10334
256k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10335
256k
        if (!buf2)
10336
0
            return NULL;
10337
256k
    }
10338
10339
20.1M
    switch (kind1) {
10340
19.9M
    case PyUnicode_1BYTE_KIND:
10341
19.9M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10342
18.7M
            out = asciilib_split(
10343
18.7M
                self,  buf1, len1, buf2, len2, maxcount);
10344
1.15M
        else
10345
1.15M
            out = ucs1lib_split(
10346
1.15M
                self,  buf1, len1, buf2, len2, maxcount);
10347
19.9M
        break;
10348
216k
    case PyUnicode_2BYTE_KIND:
10349
216k
        out = ucs2lib_split(
10350
216k
            self,  buf1, len1, buf2, len2, maxcount);
10351
216k
        break;
10352
40.3k
    case PyUnicode_4BYTE_KIND:
10353
40.3k
        out = ucs4lib_split(
10354
40.3k
            self,  buf1, len1, buf2, len2, maxcount);
10355
40.3k
        break;
10356
0
    default:
10357
0
        out = NULL;
10358
20.1M
    }
10359
20.1M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10360
20.1M
    if (kind2 != kind1)
10361
256k
        PyMem_Free((void *)buf2);
10362
20.1M
    return out;
10363
20.1M
}
10364
10365
static PyObject *
10366
rsplit(PyObject *self,
10367
       PyObject *substring,
10368
       Py_ssize_t maxcount)
10369
66
{
10370
66
    int kind1, kind2;
10371
66
    const void *buf1, *buf2;
10372
66
    Py_ssize_t len1, len2;
10373
66
    PyObject* out;
10374
10375
66
    len1 = PyUnicode_GET_LENGTH(self);
10376
66
    kind1 = PyUnicode_KIND(self);
10377
10378
66
    if (substring == NULL) {
10379
0
        if (maxcount < 0) {
10380
0
            maxcount = (len1 - 1) / 2 + 1;
10381
0
        }
10382
0
        switch (kind1) {
10383
0
        case PyUnicode_1BYTE_KIND:
10384
0
            if (PyUnicode_IS_ASCII(self))
10385
0
                return asciilib_rsplit_whitespace(
10386
0
                    self,  PyUnicode_1BYTE_DATA(self),
10387
0
                    len1, maxcount
10388
0
                    );
10389
0
            else
10390
0
                return ucs1lib_rsplit_whitespace(
10391
0
                    self,  PyUnicode_1BYTE_DATA(self),
10392
0
                    len1, maxcount
10393
0
                    );
10394
0
        case PyUnicode_2BYTE_KIND:
10395
0
            return ucs2lib_rsplit_whitespace(
10396
0
                self,  PyUnicode_2BYTE_DATA(self),
10397
0
                len1, maxcount
10398
0
                );
10399
0
        case PyUnicode_4BYTE_KIND:
10400
0
            return ucs4lib_rsplit_whitespace(
10401
0
                self,  PyUnicode_4BYTE_DATA(self),
10402
0
                len1, maxcount
10403
0
                );
10404
0
        default:
10405
0
            Py_UNREACHABLE();
10406
0
        }
10407
0
    }
10408
66
    kind2 = PyUnicode_KIND(substring);
10409
66
    len2 = PyUnicode_GET_LENGTH(substring);
10410
66
    if (maxcount < 0) {
10411
        // if len2 == 0, it will raise ValueError.
10412
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10413
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10414
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10415
0
    }
10416
66
    if (kind1 < kind2 || len1 < len2) {
10417
0
        out = PyList_New(1);
10418
0
        if (out == NULL)
10419
0
            return NULL;
10420
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10421
0
        return out;
10422
0
    }
10423
66
    buf1 = PyUnicode_DATA(self);
10424
66
    buf2 = PyUnicode_DATA(substring);
10425
66
    if (kind2 != kind1) {
10426
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10427
0
        if (!buf2)
10428
0
            return NULL;
10429
0
    }
10430
10431
66
    switch (kind1) {
10432
66
    case PyUnicode_1BYTE_KIND:
10433
66
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10434
66
            out = asciilib_rsplit(
10435
66
                self,  buf1, len1, buf2, len2, maxcount);
10436
0
        else
10437
0
            out = ucs1lib_rsplit(
10438
0
                self,  buf1, len1, buf2, len2, maxcount);
10439
66
        break;
10440
0
    case PyUnicode_2BYTE_KIND:
10441
0
        out = ucs2lib_rsplit(
10442
0
            self,  buf1, len1, buf2, len2, maxcount);
10443
0
        break;
10444
0
    case PyUnicode_4BYTE_KIND:
10445
0
        out = ucs4lib_rsplit(
10446
0
            self,  buf1, len1, buf2, len2, maxcount);
10447
0
        break;
10448
0
    default:
10449
0
        out = NULL;
10450
66
    }
10451
66
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10452
66
    if (kind2 != kind1)
10453
0
        PyMem_Free((void *)buf2);
10454
66
    return out;
10455
66
}
10456
10457
static Py_ssize_t
10458
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10459
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10460
23.2M
{
10461
23.2M
    switch (kind) {
10462
8.63M
    case PyUnicode_1BYTE_KIND:
10463
8.63M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10464
4.34M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10465
4.29M
        else
10466
4.29M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10467
6.68M
    case PyUnicode_2BYTE_KIND:
10468
6.68M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10469
7.93M
    case PyUnicode_4BYTE_KIND:
10470
7.93M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10471
23.2M
    }
10472
23.2M
    Py_UNREACHABLE();
10473
23.2M
}
10474
10475
static Py_ssize_t
10476
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10477
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10478
11.0M
{
10479
11.0M
    switch (kind) {
10480
10.2M
    case PyUnicode_1BYTE_KIND:
10481
10.2M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10482
734k
    case PyUnicode_2BYTE_KIND:
10483
734k
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10484
88.8k
    case PyUnicode_4BYTE_KIND:
10485
88.8k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10486
11.0M
    }
10487
11.0M
    Py_UNREACHABLE();
10488
11.0M
}
10489
10490
static void
10491
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10492
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10493
56.8k
{
10494
56.8k
    int kind = PyUnicode_KIND(u);
10495
56.8k
    void *data = PyUnicode_DATA(u);
10496
56.8k
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10497
56.8k
    if (kind == PyUnicode_1BYTE_KIND) {
10498
31.3k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10499
31.3k
                                      (Py_UCS1 *)data + len,
10500
31.3k
                                      u1, u2, maxcount);
10501
31.3k
    }
10502
25.5k
    else if (kind == PyUnicode_2BYTE_KIND) {
10503
20.4k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10504
20.4k
                                      (Py_UCS2 *)data + len,
10505
20.4k
                                      u1, u2, maxcount);
10506
20.4k
    }
10507
5.12k
    else {
10508
5.12k
        assert(kind == PyUnicode_4BYTE_KIND);
10509
5.12k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10510
5.12k
                                      (Py_UCS4 *)data + len,
10511
5.12k
                                      u1, u2, maxcount);
10512
5.12k
    }
10513
56.8k
}
10514
10515
static PyObject *
10516
replace(PyObject *self, PyObject *str1,
10517
        PyObject *str2, Py_ssize_t maxcount)
10518
19.6M
{
10519
19.6M
    PyObject *u;
10520
19.6M
    const char *sbuf = PyUnicode_DATA(self);
10521
19.6M
    const void *buf1 = PyUnicode_DATA(str1);
10522
19.6M
    const void *buf2 = PyUnicode_DATA(str2);
10523
19.6M
    int srelease = 0, release1 = 0, release2 = 0;
10524
19.6M
    int skind = PyUnicode_KIND(self);
10525
19.6M
    int kind1 = PyUnicode_KIND(str1);
10526
19.6M
    int kind2 = PyUnicode_KIND(str2);
10527
19.6M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10528
19.6M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10529
19.6M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10530
19.6M
    int mayshrink;
10531
19.6M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10532
10533
19.6M
    if (slen < len1)
10534
8.13M
        goto nothing;
10535
10536
11.5M
    if (maxcount < 0)
10537
11.5M
        maxcount = PY_SSIZE_T_MAX;
10538
0
    else if (maxcount == 0)
10539
0
        goto nothing;
10540
10541
11.5M
    if (str1 == str2)
10542
0
        goto nothing;
10543
10544
11.5M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10545
11.5M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10546
11.5M
    if (maxchar < maxchar_str1)
10547
        /* substring too wide to be present */
10548
0
        goto nothing;
10549
11.5M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10550
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10551
       result string. */
10552
11.5M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10553
11.5M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10554
10555
11.5M
    if (len1 == len2) {
10556
        /* same length */
10557
457k
        if (len1 == 0)
10558
0
            goto nothing;
10559
457k
        if (len1 == 1) {
10560
            /* replace characters */
10561
450k
            Py_UCS4 u1, u2;
10562
450k
            Py_ssize_t pos;
10563
10564
450k
            u1 = PyUnicode_READ(kind1, buf1, 0);
10565
450k
            pos = findchar(sbuf, skind, slen, u1, 1);
10566
450k
            if (pos < 0)
10567
393k
                goto nothing;
10568
56.8k
            u2 = PyUnicode_READ(kind2, buf2, 0);
10569
56.8k
            u = PyUnicode_New(slen, maxchar);
10570
56.8k
            if (!u)
10571
0
                goto error;
10572
10573
56.8k
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10574
56.8k
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10575
56.8k
        }
10576
7.26k
        else {
10577
7.26k
            int rkind = skind;
10578
7.26k
            char *res;
10579
7.26k
            Py_ssize_t i;
10580
10581
7.26k
            if (kind1 < rkind) {
10582
                /* widen substring */
10583
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10584
0
                if (!buf1) goto error;
10585
0
                release1 = 1;
10586
0
            }
10587
7.26k
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10588
7.26k
            if (i < 0)
10589
7.26k
                goto nothing;
10590
0
            if (rkind > kind2) {
10591
                /* widen replacement */
10592
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10593
0
                if (!buf2) goto error;
10594
0
                release2 = 1;
10595
0
            }
10596
0
            else if (rkind < kind2) {
10597
                /* widen self and buf1 */
10598
0
                rkind = kind2;
10599
0
                if (release1) {
10600
0
                    assert(buf1 != PyUnicode_DATA(str1));
10601
0
                    PyMem_Free((void *)buf1);
10602
0
                    buf1 = PyUnicode_DATA(str1);
10603
0
                    release1 = 0;
10604
0
                }
10605
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10606
0
                if (!sbuf) goto error;
10607
0
                srelease = 1;
10608
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10609
0
                if (!buf1) goto error;
10610
0
                release1 = 1;
10611
0
            }
10612
0
            u = PyUnicode_New(slen, maxchar);
10613
0
            if (!u)
10614
0
                goto error;
10615
0
            assert(PyUnicode_KIND(u) == rkind);
10616
0
            res = PyUnicode_DATA(u);
10617
10618
0
            memcpy(res, sbuf, rkind * slen);
10619
            /* change everything in-place, starting with this one */
10620
0
            memcpy(res + rkind * i,
10621
0
                   buf2,
10622
0
                   rkind * len2);
10623
0
            i += len1;
10624
10625
0
            while ( --maxcount > 0) {
10626
0
                i = anylib_find(rkind, self,
10627
0
                                sbuf+rkind*i, slen-i,
10628
0
                                str1, buf1, len1, i);
10629
0
                if (i == -1)
10630
0
                    break;
10631
0
                memcpy(res + rkind * i,
10632
0
                       buf2,
10633
0
                       rkind * len2);
10634
0
                i += len1;
10635
0
            }
10636
0
        }
10637
457k
    }
10638
11.0M
    else {
10639
11.0M
        Py_ssize_t n, i, j, ires;
10640
11.0M
        Py_ssize_t new_size;
10641
11.0M
        int rkind = skind;
10642
11.0M
        char *res;
10643
10644
11.0M
        if (kind1 < rkind) {
10645
            /* widen substring */
10646
823k
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10647
823k
            if (!buf1) goto error;
10648
823k
            release1 = 1;
10649
823k
        }
10650
11.0M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10651
11.0M
        if (n == 0)
10652
9.79M
            goto nothing;
10653
1.30M
        if (kind2 < rkind) {
10654
            /* widen replacement */
10655
45.1k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10656
45.1k
            if (!buf2) goto error;
10657
45.1k
            release2 = 1;
10658
45.1k
        }
10659
1.25M
        else if (kind2 > rkind) {
10660
            /* widen self and buf1 */
10661
0
            rkind = kind2;
10662
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10663
0
            if (!sbuf) goto error;
10664
0
            srelease = 1;
10665
0
            if (release1) {
10666
0
                assert(buf1 != PyUnicode_DATA(str1));
10667
0
                PyMem_Free((void *)buf1);
10668
0
                buf1 = PyUnicode_DATA(str1);
10669
0
                release1 = 0;
10670
0
            }
10671
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10672
0
            if (!buf1) goto error;
10673
0
            release1 = 1;
10674
0
        }
10675
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10676
           PyUnicode_GET_LENGTH(str1)); */
10677
1.30M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10678
0
                PyErr_SetString(PyExc_OverflowError,
10679
0
                                "replace string is too long");
10680
0
                goto error;
10681
0
        }
10682
1.30M
        new_size = slen + n * (len2 - len1);
10683
1.30M
        if (new_size == 0) {
10684
0
            u = _PyUnicode_GetEmpty();
10685
0
            goto done;
10686
0
        }
10687
1.30M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10688
0
            PyErr_SetString(PyExc_OverflowError,
10689
0
                            "replace string is too long");
10690
0
            goto error;
10691
0
        }
10692
1.30M
        u = PyUnicode_New(new_size, maxchar);
10693
1.30M
        if (!u)
10694
0
            goto error;
10695
1.30M
        assert(PyUnicode_KIND(u) == rkind);
10696
1.30M
        res = PyUnicode_DATA(u);
10697
1.30M
        ires = i = 0;
10698
1.30M
        if (len1 > 0) {
10699
24.5M
            while (n-- > 0) {
10700
                /* look for next match */
10701
23.2M
                j = anylib_find(rkind, self,
10702
23.2M
                                sbuf + rkind * i, slen-i,
10703
23.2M
                                str1, buf1, len1, i);
10704
23.2M
                if (j == -1)
10705
0
                    break;
10706
23.2M
                else if (j > i) {
10707
                    /* copy unchanged part [i:j] */
10708
4.63M
                    memcpy(res + rkind * ires,
10709
4.63M
                           sbuf + rkind * i,
10710
4.63M
                           rkind * (j-i));
10711
4.63M
                    ires += j - i;
10712
4.63M
                }
10713
                /* copy substitution string */
10714
23.2M
                if (len2 > 0) {
10715
23.2M
                    memcpy(res + rkind * ires,
10716
23.2M
                           buf2,
10717
23.2M
                           rkind * len2);
10718
23.2M
                    ires += len2;
10719
23.2M
                }
10720
23.2M
                i = j + len1;
10721
23.2M
            }
10722
1.30M
            if (i < slen)
10723
                /* copy tail [i:] */
10724
1.29M
                memcpy(res + rkind * ires,
10725
1.29M
                       sbuf + rkind * i,
10726
1.29M
                       rkind * (slen-i));
10727
1.30M
        }
10728
0
        else {
10729
            /* interleave */
10730
0
            while (n > 0) {
10731
0
                memcpy(res + rkind * ires,
10732
0
                       buf2,
10733
0
                       rkind * len2);
10734
0
                ires += len2;
10735
0
                if (--n <= 0)
10736
0
                    break;
10737
0
                memcpy(res + rkind * ires,
10738
0
                       sbuf + rkind * i,
10739
0
                       rkind);
10740
0
                ires++;
10741
0
                i++;
10742
0
            }
10743
0
            memcpy(res + rkind * ires,
10744
0
                   sbuf + rkind * i,
10745
0
                   rkind * (slen-i));
10746
0
        }
10747
1.30M
    }
10748
10749
1.35M
    if (mayshrink) {
10750
0
        unicode_adjust_maxchar(&u);
10751
0
        if (u == NULL)
10752
0
            goto error;
10753
0
    }
10754
10755
1.35M
  done:
10756
1.35M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10757
1.35M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10758
1.35M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10759
1.35M
    if (srelease)
10760
0
        PyMem_Free((void *)sbuf);
10761
1.35M
    if (release1)
10762
45.1k
        PyMem_Free((void *)buf1);
10763
1.35M
    if (release2)
10764
45.1k
        PyMem_Free((void *)buf2);
10765
1.35M
    assert(_PyUnicode_CheckConsistency(u, 1));
10766
1.35M
    return u;
10767
10768
18.3M
  nothing:
10769
    /* nothing to replace; return original string (when possible) */
10770
18.3M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10771
18.3M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10772
18.3M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10773
18.3M
    if (srelease)
10774
0
        PyMem_Free((void *)sbuf);
10775
18.3M
    if (release1)
10776
778k
        PyMem_Free((void *)buf1);
10777
18.3M
    if (release2)
10778
0
        PyMem_Free((void *)buf2);
10779
18.3M
    return unicode_result_unchanged(self);
10780
10781
0
  error:
10782
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10783
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10784
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10785
0
    if (srelease)
10786
0
        PyMem_Free((void *)sbuf);
10787
0
    if (release1)
10788
0
        PyMem_Free((void *)buf1);
10789
0
    if (release2)
10790
0
        PyMem_Free((void *)buf2);
10791
0
    return NULL;
10792
1.35M
}
10793
10794
/* --- Unicode Object Methods --------------------------------------------- */
10795
10796
/*[clinic input]
10797
str.title as unicode_title
10798
10799
Return a version of the string where each word is titlecased.
10800
10801
More specifically, words start with uppercased characters and all
10802
remaining cased characters have lower case.
10803
[clinic start generated code]*/
10804
10805
static PyObject *
10806
unicode_title_impl(PyObject *self)
10807
/*[clinic end generated code: output=c75ae03809574902 input=2a07e2c7df94627a]*/
10808
0
{
10809
0
    return case_operation(self, do_title);
10810
0
}
10811
10812
/*[clinic input]
10813
str.capitalize as unicode_capitalize
10814
10815
Return a capitalized version of the string.
10816
10817
More specifically, make the first character have upper case and the
10818
rest lower case.
10819
[clinic start generated code]*/
10820
10821
static PyObject *
10822
unicode_capitalize_impl(PyObject *self)
10823
/*[clinic end generated code: output=e49a4c333cdb7667 input=e50e50ed45a654cf]*/
10824
0
{
10825
0
    if (PyUnicode_GET_LENGTH(self) == 0)
10826
0
        return unicode_result_unchanged(self);
10827
0
    return case_operation(self, do_capitalize);
10828
0
}
10829
10830
/*[clinic input]
10831
str.casefold as unicode_casefold
10832
10833
Return a version of the string suitable for caseless comparisons.
10834
[clinic start generated code]*/
10835
10836
static PyObject *
10837
unicode_casefold_impl(PyObject *self)
10838
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
10839
0
{
10840
0
    if (PyUnicode_IS_ASCII(self))
10841
0
        return ascii_upper_or_lower(self, 1);
10842
0
    return case_operation(self, do_casefold);
10843
0
}
10844
10845
10846
/* Argument converter. Accepts a single Unicode character. */
10847
10848
static int
10849
convert_uc(PyObject *obj, void *addr)
10850
130
{
10851
130
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10852
10853
130
    if (!PyUnicode_Check(obj)) {
10854
0
        PyErr_Format(PyExc_TypeError,
10855
0
                     "The fill character must be a unicode character, "
10856
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10857
0
        return 0;
10858
0
    }
10859
130
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10860
0
        PyErr_SetString(PyExc_TypeError,
10861
0
                        "The fill character must be exactly one character long");
10862
0
        return 0;
10863
0
    }
10864
130
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10865
130
    return 1;
10866
130
}
10867
10868
/*[clinic input]
10869
str.center as unicode_center
10870
10871
    width: Py_ssize_t
10872
    fillchar: Py_UCS4 = ' '
10873
    /
10874
10875
Return a centered string of length width.
10876
10877
Padding is done using the specified fill character (default is
10878
a space).
10879
[clinic start generated code]*/
10880
10881
static PyObject *
10882
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10883
/*[clinic end generated code: output=420c8859effc7c0c input=df91017dfd186a78]*/
10884
0
{
10885
0
    Py_ssize_t marg, left;
10886
10887
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10888
0
        return unicode_result_unchanged(self);
10889
10890
0
    marg = width - PyUnicode_GET_LENGTH(self);
10891
0
    left = marg / 2 + (marg & width & 1);
10892
10893
0
    return pad(self, left, marg - left, fillchar);
10894
0
}
10895
10896
/* This function assumes that str1 and str2 are readied by the caller. */
10897
10898
static int
10899
unicode_compare(PyObject *str1, PyObject *str2)
10900
28.1M
{
10901
28.1M
#define COMPARE(TYPE1, TYPE2) \
10902
28.1M
    do { \
10903
25.8M
        TYPE1* p1 = (TYPE1 *)data1; \
10904
25.8M
        TYPE2* p2 = (TYPE2 *)data2; \
10905
25.8M
        TYPE1* end = p1 + len; \
10906
25.8M
        Py_UCS4 c1, c2; \
10907
25.8M
        for (; p1 != end; p1++, p2++) { \
10908
25.8M
            c1 = *p1; \
10909
25.8M
            c2 = *p2; \
10910
25.8M
            if (c1 != c2) \
10911
25.8M
                return (c1 < c2) ? -1 : 1; \
10912
25.8M
        } \
10913
25.8M
    } \
10914
25.8M
    while (0)
10915
10916
28.1M
    int kind1, kind2;
10917
28.1M
    const void *data1, *data2;
10918
28.1M
    Py_ssize_t len1, len2, len;
10919
10920
28.1M
    kind1 = PyUnicode_KIND(str1);
10921
28.1M
    kind2 = PyUnicode_KIND(str2);
10922
28.1M
    data1 = PyUnicode_DATA(str1);
10923
28.1M
    data2 = PyUnicode_DATA(str2);
10924
28.1M
    len1 = PyUnicode_GET_LENGTH(str1);
10925
28.1M
    len2 = PyUnicode_GET_LENGTH(str2);
10926
28.1M
    len = Py_MIN(len1, len2);
10927
10928
28.1M
    switch(kind1) {
10929
4.25M
    case PyUnicode_1BYTE_KIND:
10930
4.25M
    {
10931
4.25M
        switch(kind2) {
10932
433k
        case PyUnicode_1BYTE_KIND:
10933
433k
        {
10934
433k
            int cmp = memcmp(data1, data2, len);
10935
            /* normalize result of memcmp() into the range [-1; 1] */
10936
433k
            if (cmp < 0)
10937
372k
                return -1;
10938
60.9k
            if (cmp > 0)
10939
54.7k
                return 1;
10940
6.22k
            break;
10941
60.9k
        }
10942
3.32M
        case PyUnicode_2BYTE_KIND:
10943
3.32M
            COMPARE(Py_UCS1, Py_UCS2);
10944
0
            break;
10945
494k
        case PyUnicode_4BYTE_KIND:
10946
494k
            COMPARE(Py_UCS1, Py_UCS4);
10947
0
            break;
10948
0
        default:
10949
0
            Py_UNREACHABLE();
10950
4.25M
        }
10951
6.22k
        break;
10952
4.25M
    }
10953
21.7M
    case PyUnicode_2BYTE_KIND:
10954
21.7M
    {
10955
21.7M
        switch(kind2) {
10956
79.4k
        case PyUnicode_1BYTE_KIND:
10957
79.4k
            COMPARE(Py_UCS2, Py_UCS1);
10958
0
            break;
10959
21.2M
        case PyUnicode_2BYTE_KIND:
10960
21.2M
        {
10961
21.2M
            COMPARE(Py_UCS2, Py_UCS2);
10962
0
            break;
10963
21.2M
        }
10964
383k
        case PyUnicode_4BYTE_KIND:
10965
383k
            COMPARE(Py_UCS2, Py_UCS4);
10966
0
            break;
10967
0
        default:
10968
0
            Py_UNREACHABLE();
10969
21.7M
        }
10970
0
        break;
10971
21.7M
    }
10972
2.14M
    case PyUnicode_4BYTE_KIND:
10973
2.14M
    {
10974
2.14M
        switch(kind2) {
10975
8.09k
        case PyUnicode_1BYTE_KIND:
10976
8.09k
            COMPARE(Py_UCS4, Py_UCS1);
10977
0
            break;
10978
284k
        case PyUnicode_2BYTE_KIND:
10979
284k
            COMPARE(Py_UCS4, Py_UCS2);
10980
0
            break;
10981
1.85M
        case PyUnicode_4BYTE_KIND:
10982
1.85M
        {
10983
1.85M
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10984
1.85M
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10985
            /* normalize result of wmemcmp() into the range [-1; 1] */
10986
1.85M
            if (cmp < 0)
10987
917k
                return -1;
10988
934k
            if (cmp > 0)
10989
934k
                return 1;
10990
#else
10991
            COMPARE(Py_UCS4, Py_UCS4);
10992
#endif
10993
0
            break;
10994
934k
        }
10995
0
        default:
10996
0
            Py_UNREACHABLE();
10997
2.14M
        }
10998
0
        break;
10999
2.14M
    }
11000
0
    default:
11001
0
        Py_UNREACHABLE();
11002
28.1M
    }
11003
11004
6.22k
    if (len1 == len2)
11005
6.18k
        return 0;
11006
39
    if (len1 < len2)
11007
14
        return -1;
11008
25
    else
11009
25
        return 1;
11010
11011
39
#undef COMPARE
11012
39
}
11013
11014
11015
int
11016
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11017
605M
{
11018
605M
    assert(PyUnicode_Check(str1));
11019
605M
    assert(PyUnicode_Check(str2));
11020
605M
    if (str1 == str2) {
11021
84.1M
        return 1;
11022
84.1M
    }
11023
521M
    return unicode_eq(str1, str2);
11024
605M
}
11025
11026
11027
int
11028
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11029
0
{
11030
0
    if (!PyUnicode_Check(str1)) {
11031
0
        PyErr_Format(PyExc_TypeError,
11032
0
                     "first argument must be str, not %T", str1);
11033
0
        return -1;
11034
0
    }
11035
0
    if (!PyUnicode_Check(str2)) {
11036
0
        PyErr_Format(PyExc_TypeError,
11037
0
                     "second argument must be str, not %T", str2);
11038
0
        return -1;
11039
0
    }
11040
11041
0
    return _PyUnicode_Equal(str1, str2);
11042
0
}
11043
11044
11045
int
11046
PyUnicode_Compare(PyObject *left, PyObject *right)
11047
276k
{
11048
276k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11049
        /* a string is equal to itself */
11050
276k
        if (left == right)
11051
0
            return 0;
11052
11053
276k
        return unicode_compare(left, right);
11054
276k
    }
11055
0
    PyErr_Format(PyExc_TypeError,
11056
0
                 "Can't compare %.100s and %.100s",
11057
0
                 Py_TYPE(left)->tp_name,
11058
0
                 Py_TYPE(right)->tp_name);
11059
0
    return -1;
11060
276k
}
11061
11062
int
11063
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11064
12.2M
{
11065
12.2M
    Py_ssize_t i;
11066
12.2M
    int kind;
11067
12.2M
    Py_UCS4 chr;
11068
11069
12.2M
    assert(_PyUnicode_CHECK(uni));
11070
12.2M
    kind = PyUnicode_KIND(uni);
11071
12.2M
    if (kind == PyUnicode_1BYTE_KIND) {
11072
12.2M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11073
12.2M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11074
12.2M
        size_t len, len2 = strlen(str);
11075
12.2M
        int cmp;
11076
11077
12.2M
        len = Py_MIN(len1, len2);
11078
12.2M
        cmp = memcmp(data, str, len);
11079
12.2M
        if (cmp != 0) {
11080
8.09M
            if (cmp < 0)
11081
49.3k
                return -1;
11082
8.04M
            else
11083
8.04M
                return 1;
11084
8.09M
        }
11085
4.14M
        if (len1 > len2)
11086
199
            return 1; /* uni is longer */
11087
4.14M
        if (len1 < len2)
11088
496
            return -1; /* str is longer */
11089
4.14M
        return 0;
11090
4.14M
    }
11091
1.33k
    else {
11092
1.33k
        const void *data = PyUnicode_DATA(uni);
11093
        /* Compare Unicode string and source character set string */
11094
1.86k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11095
1.79k
            if (chr != (unsigned char)str[i])
11096
1.25k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11097
        /* This check keeps Python strings that end in '\0' from comparing equal
11098
         to C strings identical up to that point. */
11099
75
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11100
75
            return 1; /* uni is longer */
11101
0
        if (str[i])
11102
0
            return -1; /* str is longer */
11103
0
        return 0;
11104
0
    }
11105
12.2M
}
11106
11107
int
11108
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11109
24
{
11110
24
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11111
24
}
11112
11113
int
11114
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11115
24
{
11116
24
    assert(_PyUnicode_CHECK(unicode));
11117
24
    assert(str);
11118
11119
24
    if (PyUnicode_IS_ASCII(unicode)) {
11120
24
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11121
24
        return size == len &&
11122
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11123
24
    }
11124
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11125
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11126
0
        return size == len &&
11127
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11128
0
    }
11129
11130
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11131
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11132
0
        return 0;
11133
0
    }
11134
0
    const unsigned char *s = (const unsigned char *)str;
11135
0
    const unsigned char *ends = s + (size_t)size;
11136
0
    int kind = PyUnicode_KIND(unicode);
11137
0
    const void *data = PyUnicode_DATA(unicode);
11138
    /* Compare Unicode string and UTF-8 string */
11139
0
    for (Py_ssize_t i = 0; i < len; i++) {
11140
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11141
0
        if (ch < 0x80) {
11142
0
            if (ends == s || s[0] != ch) {
11143
0
                return 0;
11144
0
            }
11145
0
            s += 1;
11146
0
        }
11147
0
        else if (ch < 0x800) {
11148
0
            if ((ends - s) < 2 ||
11149
0
                s[0] != (0xc0 | (ch >> 6)) ||
11150
0
                s[1] != (0x80 | (ch & 0x3f)))
11151
0
            {
11152
0
                return 0;
11153
0
            }
11154
0
            s += 2;
11155
0
        }
11156
0
        else if (ch < 0x10000) {
11157
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11158
0
                (ends - s) < 3 ||
11159
0
                s[0] != (0xe0 | (ch >> 12)) ||
11160
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11161
0
                s[2] != (0x80 | (ch & 0x3f)))
11162
0
            {
11163
0
                return 0;
11164
0
            }
11165
0
            s += 3;
11166
0
        }
11167
0
        else {
11168
0
            assert(ch <= MAX_UNICODE);
11169
0
            if ((ends - s) < 4 ||
11170
0
                s[0] != (0xf0 | (ch >> 18)) ||
11171
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11172
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11173
0
                s[3] != (0x80 | (ch & 0x3f)))
11174
0
            {
11175
0
                return 0;
11176
0
            }
11177
0
            s += 4;
11178
0
        }
11179
0
    }
11180
0
    return s == ends;
11181
0
}
11182
11183
int
11184
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11185
37.9M
{
11186
37.9M
    size_t len;
11187
37.9M
    assert(_PyUnicode_CHECK(unicode));
11188
37.9M
    assert(str);
11189
#ifndef NDEBUG
11190
    for (const char *p = str; *p; p++) {
11191
        assert((unsigned char)*p < 128);
11192
    }
11193
#endif
11194
37.9M
    if (!PyUnicode_IS_ASCII(unicode))
11195
169k
        return 0;
11196
37.7M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11197
37.7M
    return strlen(str) == len &&
11198
707k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11199
37.9M
}
11200
11201
PyObject *
11202
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
11203
64.9M
{
11204
64.9M
    int result;
11205
11206
64.9M
    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11207
237k
        Py_RETURN_NOTIMPLEMENTED;
11208
11209
64.7M
    if (left == right) {
11210
2.37k
        switch (op) {
11211
2.13k
        case Py_EQ:
11212
2.13k
        case Py_LE:
11213
2.13k
        case Py_GE:
11214
            /* a string is equal to itself */
11215
2.13k
            Py_RETURN_TRUE;
11216
238
        case Py_NE:
11217
238
        case Py_LT:
11218
238
        case Py_GT:
11219
238
            Py_RETURN_FALSE;
11220
0
        default:
11221
0
            PyErr_BadArgument();
11222
0
            return NULL;
11223
2.37k
        }
11224
2.37k
    }
11225
64.7M
    else if (op == Py_EQ || op == Py_NE) {
11226
36.8M
        result = unicode_eq(left, right);
11227
36.8M
        result ^= (op == Py_NE);
11228
36.8M
        return PyBool_FromLong(result);
11229
36.8M
    }
11230
27.8M
    else {
11231
27.8M
        result = unicode_compare(left, right);
11232
27.8M
        Py_RETURN_RICHCOMPARE(result, 0, op);
11233
27.8M
    }
11234
64.7M
}
11235
11236
int
11237
PyUnicode_Contains(PyObject *str, PyObject *substr)
11238
224M
{
11239
224M
    int kind1, kind2;
11240
224M
    const void *buf1, *buf2;
11241
224M
    Py_ssize_t len1, len2;
11242
224M
    int result;
11243
11244
224M
    if (!PyUnicode_Check(substr)) {
11245
0
        PyErr_Format(PyExc_TypeError,
11246
0
                     "'in <string>' requires string as left operand, not %.100s",
11247
0
                     Py_TYPE(substr)->tp_name);
11248
0
        return -1;
11249
0
    }
11250
224M
    if (ensure_unicode(str) < 0)
11251
0
        return -1;
11252
11253
224M
    kind1 = PyUnicode_KIND(str);
11254
224M
    kind2 = PyUnicode_KIND(substr);
11255
224M
    if (kind1 < kind2)
11256
16.0M
        return 0;
11257
208M
    len1 = PyUnicode_GET_LENGTH(str);
11258
208M
    len2 = PyUnicode_GET_LENGTH(substr);
11259
208M
    if (len1 < len2)
11260
1.07M
        return 0;
11261
207M
    buf1 = PyUnicode_DATA(str);
11262
207M
    buf2 = PyUnicode_DATA(substr);
11263
207M
    if (len2 == 1) {
11264
185M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11265
185M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11266
185M
        return result;
11267
185M
    }
11268
22.0M
    if (kind2 != kind1) {
11269
18.8k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11270
18.8k
        if (!buf2)
11271
0
            return -1;
11272
18.8k
    }
11273
11274
22.0M
    switch (kind1) {
11275
21.9M
    case PyUnicode_1BYTE_KIND:
11276
21.9M
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11277
21.9M
        break;
11278
14.2k
    case PyUnicode_2BYTE_KIND:
11279
14.2k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11280
14.2k
        break;
11281
4.60k
    case PyUnicode_4BYTE_KIND:
11282
4.60k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11283
4.60k
        break;
11284
0
    default:
11285
0
        Py_UNREACHABLE();
11286
22.0M
    }
11287
11288
22.0M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11289
22.0M
    if (kind2 != kind1)
11290
18.8k
        PyMem_Free((void *)buf2);
11291
11292
22.0M
    return result;
11293
22.0M
}
11294
11295
/* Concat to string or Unicode object giving a new Unicode object. */
11296
11297
PyObject *
11298
PyUnicode_Concat(PyObject *left, PyObject *right)
11299
26.8M
{
11300
26.8M
    PyObject *result;
11301
26.8M
    Py_UCS4 maxchar, maxchar2;
11302
26.8M
    Py_ssize_t left_len, right_len, new_len;
11303
11304
26.8M
    if (ensure_unicode(left) < 0)
11305
0
        return NULL;
11306
11307
26.8M
    if (!PyUnicode_Check(right)) {
11308
0
        PyErr_Format(PyExc_TypeError,
11309
0
            "can only concatenate str (not \"%.200s\") to str",
11310
0
            Py_TYPE(right)->tp_name);
11311
0
        return NULL;
11312
0
    }
11313
11314
    /* Shortcuts */
11315
26.8M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11316
26.8M
    if (left == empty) {
11317
436k
        return PyUnicode_FromObject(right);
11318
436k
    }
11319
26.4M
    if (right == empty) {
11320
1.60M
        return PyUnicode_FromObject(left);
11321
1.60M
    }
11322
11323
24.8M
    left_len = PyUnicode_GET_LENGTH(left);
11324
24.8M
    right_len = PyUnicode_GET_LENGTH(right);
11325
24.8M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11326
0
        PyErr_SetString(PyExc_OverflowError,
11327
0
                        "strings are too large to concat");
11328
0
        return NULL;
11329
0
    }
11330
24.8M
    new_len = left_len + right_len;
11331
11332
24.8M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11333
24.8M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11334
24.8M
    maxchar = Py_MAX(maxchar, maxchar2);
11335
11336
    /* Concat the two Unicode strings */
11337
24.8M
    result = PyUnicode_New(new_len, maxchar);
11338
24.8M
    if (result == NULL)
11339
0
        return NULL;
11340
24.8M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11341
24.8M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11342
24.8M
    assert(_PyUnicode_CheckConsistency(result, 1));
11343
24.8M
    return result;
11344
24.8M
}
11345
11346
void
11347
PyUnicode_Append(PyObject **p_left, PyObject *right)
11348
5.56M
{
11349
5.56M
    PyObject *left, *res;
11350
5.56M
    Py_UCS4 maxchar, maxchar2;
11351
5.56M
    Py_ssize_t left_len, right_len, new_len;
11352
11353
5.56M
    if (p_left == NULL) {
11354
0
        if (!PyErr_Occurred())
11355
0
            PyErr_BadInternalCall();
11356
0
        return;
11357
0
    }
11358
5.56M
    left = *p_left;
11359
5.56M
    if (right == NULL || left == NULL
11360
5.56M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11361
0
        if (!PyErr_Occurred())
11362
0
            PyErr_BadInternalCall();
11363
0
        goto error;
11364
0
    }
11365
11366
    /* Shortcuts */
11367
5.56M
    PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
11368
5.56M
    if (left == empty) {
11369
435k
        Py_DECREF(left);
11370
435k
        *p_left = Py_NewRef(right);
11371
435k
        return;
11372
435k
    }
11373
5.13M
    if (right == empty) {
11374
12.7k
        return;
11375
12.7k
    }
11376
11377
5.11M
    left_len = PyUnicode_GET_LENGTH(left);
11378
5.11M
    right_len = PyUnicode_GET_LENGTH(right);
11379
5.11M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11380
0
        PyErr_SetString(PyExc_OverflowError,
11381
0
                        "strings are too large to concat");
11382
0
        goto error;
11383
0
    }
11384
5.11M
    new_len = left_len + right_len;
11385
11386
5.11M
    if (_PyUnicode_IsModifiable(left)
11387
5.11M
        && PyUnicode_CheckExact(right)
11388
5.11M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11389
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11390
           to change the structure size, but characters are stored just after
11391
           the structure, and so it requires to move all characters which is
11392
           not so different than duplicating the string. */
11393
1.93M
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11394
1.93M
    {
11395
        /* append inplace */
11396
1.93M
        if (unicode_resize(p_left, new_len) != 0)
11397
0
            goto error;
11398
11399
        /* copy 'right' into the newly allocated area of 'left' */
11400
1.93M
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11401
1.93M
    }
11402
3.18M
    else {
11403
3.18M
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11404
3.18M
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11405
3.18M
        maxchar = Py_MAX(maxchar, maxchar2);
11406
11407
        /* Concat the two Unicode strings */
11408
3.18M
        res = PyUnicode_New(new_len, maxchar);
11409
3.18M
        if (res == NULL)
11410
0
            goto error;
11411
3.18M
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11412
3.18M
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11413
3.18M
        Py_DECREF(left);
11414
3.18M
        *p_left = res;
11415
3.18M
    }
11416
5.11M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11417
5.11M
    return;
11418
11419
0
error:
11420
0
    Py_CLEAR(*p_left);
11421
0
}
11422
11423
void
11424
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11425
8
{
11426
8
    PyUnicode_Append(pleft, right);
11427
8
    Py_XDECREF(right);
11428
8
}
11429
11430
/*[clinic input]
11431
@permit_long_summary
11432
@text_signature "($self, sub[, start[, end]], /)"
11433
str.count as unicode_count -> Py_ssize_t
11434
11435
    self as str: self
11436
    sub as substr: unicode
11437
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11438
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11439
    /
11440
11441
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11442
11443
Optional arguments start and end are interpreted as in slice
11444
notation.
11445
[clinic start generated code]*/
11446
11447
static Py_ssize_t
11448
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11449
                   Py_ssize_t end)
11450
/*[clinic end generated code: output=8fcc3aef0b18edbf input=c9209e05438cc352]*/
11451
27.0M
{
11452
27.0M
    assert(PyUnicode_Check(str));
11453
27.0M
    assert(PyUnicode_Check(substr));
11454
11455
27.0M
    Py_ssize_t result;
11456
27.0M
    int kind1, kind2;
11457
27.0M
    const void *buf1 = NULL, *buf2 = NULL;
11458
27.0M
    Py_ssize_t len1, len2;
11459
11460
27.0M
    kind1 = PyUnicode_KIND(str);
11461
27.0M
    kind2 = PyUnicode_KIND(substr);
11462
27.0M
    if (kind1 < kind2)
11463
0
        return 0;
11464
11465
27.0M
    len1 = PyUnicode_GET_LENGTH(str);
11466
27.0M
    len2 = PyUnicode_GET_LENGTH(substr);
11467
27.0M
    ADJUST_INDICES(start, end, len1);
11468
27.0M
    if (end - start < len2)
11469
4.49M
        return 0;
11470
11471
22.5M
    buf1 = PyUnicode_DATA(str);
11472
22.5M
    buf2 = PyUnicode_DATA(substr);
11473
22.5M
    if (kind2 != kind1) {
11474
6.06M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11475
6.06M
        if (!buf2)
11476
0
            goto onError;
11477
6.06M
    }
11478
11479
    // We don't reuse `anylib_count` here because of the explicit casts.
11480
22.5M
    switch (kind1) {
11481
16.4M
    case PyUnicode_1BYTE_KIND:
11482
16.4M
        result = ucs1lib_count(
11483
16.4M
            ((const Py_UCS1*)buf1) + start, end - start,
11484
16.4M
            buf2, len2, PY_SSIZE_T_MAX
11485
16.4M
            );
11486
16.4M
        break;
11487
3.70M
    case PyUnicode_2BYTE_KIND:
11488
3.70M
        result = ucs2lib_count(
11489
3.70M
            ((const Py_UCS2*)buf1) + start, end - start,
11490
3.70M
            buf2, len2, PY_SSIZE_T_MAX
11491
3.70M
            );
11492
3.70M
        break;
11493
2.36M
    case PyUnicode_4BYTE_KIND:
11494
2.36M
        result = ucs4lib_count(
11495
2.36M
            ((const Py_UCS4*)buf1) + start, end - start,
11496
2.36M
            buf2, len2, PY_SSIZE_T_MAX
11497
2.36M
            );
11498
2.36M
        break;
11499
0
    default:
11500
0
        Py_UNREACHABLE();
11501
22.5M
    }
11502
11503
22.5M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11504
22.5M
    if (kind2 != kind1)
11505
6.06M
        PyMem_Free((void *)buf2);
11506
11507
22.5M
    return result;
11508
0
  onError:
11509
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11510
0
    if (kind2 != kind1)
11511
0
        PyMem_Free((void *)buf2);
11512
0
    return -1;
11513
22.5M
}
11514
11515
/*[clinic input]
11516
str.encode as unicode_encode
11517
11518
    encoding: str(c_default="NULL") = 'utf-8'
11519
        The encoding in which to encode the string.
11520
    errors: str(c_default="NULL") = 'strict'
11521
        The error handling scheme to use for encoding errors.
11522
        The default is 'strict' meaning that encoding errors raise a
11523
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace'
11524
        and 'xmlcharrefreplace' as well as any other name registered with
11525
        codecs.register_error that can handle UnicodeEncodeErrors.
11526
11527
Encode the string using the codec registered for encoding.
11528
[clinic start generated code]*/
11529
11530
static PyObject *
11531
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
11532
/*[clinic end generated code: output=bf78b6e2a9470e3c input=b85a9645cb33b729]*/
11533
19.3M
{
11534
19.3M
    return PyUnicode_AsEncodedString(self, encoding, errors);
11535
19.3M
}
11536
11537
/*[clinic input]
11538
str.expandtabs as unicode_expandtabs
11539
11540
    tabsize: int = 8
11541
11542
Return a copy where all tab characters are expanded using spaces.
11543
11544
If tabsize is not given, a tab size of 8 characters is assumed.
11545
[clinic start generated code]*/
11546
11547
static PyObject *
11548
unicode_expandtabs_impl(PyObject *self, int tabsize)
11549
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
11550
1.22M
{
11551
1.22M
    Py_ssize_t i, j, line_pos, src_len, incr;
11552
1.22M
    Py_UCS4 ch;
11553
1.22M
    PyObject *u;
11554
1.22M
    const void *src_data;
11555
1.22M
    void *dest_data;
11556
1.22M
    int kind;
11557
1.22M
    int found;
11558
11559
    /* First pass: determine size of output string */
11560
1.22M
    src_len = PyUnicode_GET_LENGTH(self);
11561
1.22M
    i = j = line_pos = 0;
11562
1.22M
    kind = PyUnicode_KIND(self);
11563
1.22M
    src_data = PyUnicode_DATA(self);
11564
1.22M
    found = 0;
11565
2.78M
    for (; i < src_len; i++) {
11566
1.55M
        ch = PyUnicode_READ(kind, src_data, i);
11567
1.55M
        if (ch == '\t') {
11568
370k
            found = 1;
11569
370k
            if (tabsize > 0) {
11570
370k
                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11571
370k
                if (j > PY_SSIZE_T_MAX - incr)
11572
0
                    goto overflow;
11573
370k
                line_pos += incr;
11574
370k
                j += incr;
11575
370k
            }
11576
370k
        }
11577
1.18M
        else {
11578
1.18M
            if (j > PY_SSIZE_T_MAX - 1)
11579
0
                goto overflow;
11580
1.18M
            line_pos++;
11581
1.18M
            j++;
11582
1.18M
            if (ch == '\n' || ch == '\r')
11583
4.54k
                line_pos = 0;
11584
1.18M
        }
11585
1.55M
    }
11586
1.22M
    if (!found)
11587
1.20M
        return unicode_result_unchanged(self);
11588
11589
    /* Second pass: create output string and fill it */
11590
26.4k
    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11591
26.4k
    if (!u)
11592
0
        return NULL;
11593
26.4k
    dest_data = PyUnicode_DATA(u);
11594
11595
26.4k
    i = j = line_pos = 0;
11596
11597
717k
    for (; i < src_len; i++) {
11598
690k
        ch = PyUnicode_READ(kind, src_data, i);
11599
690k
        if (ch == '\t') {
11600
370k
            if (tabsize > 0) {
11601
370k
                incr = tabsize - (line_pos % tabsize);
11602
370k
                line_pos += incr;
11603
370k
                _PyUnicode_Fill(kind, dest_data, ' ', j, incr);
11604
370k
                j += incr;
11605
370k
            }
11606
370k
        }
11607
320k
        else {
11608
320k
            line_pos++;
11609
320k
            PyUnicode_WRITE(kind, dest_data, j, ch);
11610
320k
            j++;
11611
320k
            if (ch == '\n' || ch == '\r')
11612
0
                line_pos = 0;
11613
320k
        }
11614
690k
    }
11615
26.4k
    assert (j == PyUnicode_GET_LENGTH(u));
11616
26.4k
    return unicode_result(u);
11617
11618
0
  overflow:
11619
0
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11620
0
    return NULL;
11621
26.4k
}
11622
11623
/*[clinic input]
11624
@permit_long_summary
11625
str.find as unicode_find = str.count
11626
11627
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11628
11629
Optional arguments start and end are interpreted as in slice
11630
notation.  Return -1 on failure.
11631
[clinic start generated code]*/
11632
11633
static Py_ssize_t
11634
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11635
                  Py_ssize_t end)
11636
/*[clinic end generated code: output=51dbe6255712e278 input=f57e93c59d1ee927]*/
11637
26.4M
{
11638
26.4M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11639
26.4M
    if (result < 0) {
11640
5.72M
        return -1;
11641
5.72M
    }
11642
20.7M
    return result;
11643
26.4M
}
11644
11645
static PyObject *
11646
unicode_getitem(PyObject *self, Py_ssize_t index)
11647
61.6M
{
11648
61.6M
    const void *data;
11649
61.6M
    int kind;
11650
61.6M
    Py_UCS4 ch;
11651
11652
61.6M
    if (!PyUnicode_Check(self)) {
11653
0
        PyErr_BadArgument();
11654
0
        return NULL;
11655
0
    }
11656
61.6M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11657
14.9k
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11658
14.9k
        return NULL;
11659
14.9k
    }
11660
61.6M
    kind = PyUnicode_KIND(self);
11661
61.6M
    data = PyUnicode_DATA(self);
11662
61.6M
    ch = PyUnicode_READ(kind, data, index);
11663
61.6M
    return unicode_char(ch);
11664
61.6M
}
11665
11666
/* Believe it or not, this produces the same value for ASCII strings
11667
   as bytes_hash(). */
11668
static Py_hash_t
11669
unicode_hash(PyObject *self)
11670
121M
{
11671
121M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11672
11673
#ifdef Py_DEBUG
11674
    assert(_Py_HashSecret_Initialized);
11675
#endif
11676
121M
    Py_hash_t hash = PyUnicode_HASH(self);
11677
121M
    if (hash != -1) {
11678
70.0M
        return hash;
11679
70.0M
    }
11680
51.1M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11681
51.1M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11682
11683
51.1M
    PyUnicode_SET_HASH(self, x);
11684
51.1M
    return x;
11685
121M
}
11686
11687
/*[clinic input]
11688
@permit_long_summary
11689
str.index as unicode_index = str.count
11690
11691
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11692
11693
Optional arguments start and end are interpreted as in slice
11694
notation.  Raises ValueError when the substring is not found.
11695
[clinic start generated code]*/
11696
11697
static Py_ssize_t
11698
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11699
                   Py_ssize_t end)
11700
/*[clinic end generated code: output=77558288837cdf40 input=5900ab84de55e628]*/
11701
45.0k
{
11702
45.0k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11703
45.0k
    if (result == -1) {
11704
712
        PyErr_SetString(PyExc_ValueError, "substring not found");
11705
712
    }
11706
44.3k
    else if (result < 0) {
11707
0
        return -1;
11708
0
    }
11709
45.0k
    return result;
11710
45.0k
}
11711
11712
/*[clinic input]
11713
@permit_long_summary
11714
str.isascii as unicode_isascii
11715
11716
Return True if all characters in the string are ASCII, False otherwise.
11717
11718
ASCII characters have code points in the range U+0000-U+007F.
11719
Empty string is ASCII too.
11720
[clinic start generated code]*/
11721
11722
static PyObject *
11723
unicode_isascii_impl(PyObject *self)
11724
/*[clinic end generated code: output=c5910d64b5a8003f input=dc74e1ced821159f]*/
11725
5.27k
{
11726
5.27k
    return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11727
5.27k
}
11728
11729
/*[clinic input]
11730
str.islower as unicode_islower
11731
11732
Return True if the string is a lowercase string, False otherwise.
11733
11734
A string is lowercase if all cased characters in the string are
11735
lowercase and there is at least one cased character in the string.
11736
[clinic start generated code]*/
11737
11738
static PyObject *
11739
unicode_islower_impl(PyObject *self)
11740
/*[clinic end generated code: output=dbd41995bd005b81 input=1879b48dfc628366]*/
11741
0
{
11742
0
    Py_ssize_t i, length;
11743
0
    int kind;
11744
0
    const void *data;
11745
0
    int cased;
11746
11747
0
    length = PyUnicode_GET_LENGTH(self);
11748
0
    kind = PyUnicode_KIND(self);
11749
0
    data = PyUnicode_DATA(self);
11750
11751
    /* Shortcut for single character strings */
11752
0
    if (length == 1)
11753
0
        return PyBool_FromLong(
11754
0
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11755
11756
    /* Special case for empty strings */
11757
0
    if (length == 0)
11758
0
        Py_RETURN_FALSE;
11759
11760
0
    cased = 0;
11761
0
    for (i = 0; i < length; i++) {
11762
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11763
11764
0
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11765
0
            Py_RETURN_FALSE;
11766
0
        else if (!cased && Py_UNICODE_ISLOWER(ch))
11767
0
            cased = 1;
11768
0
    }
11769
0
    return PyBool_FromLong(cased);
11770
0
}
11771
11772
/*[clinic input]
11773
str.isupper as unicode_isupper
11774
11775
Return True if the string is an uppercase string, False otherwise.
11776
11777
A string is uppercase if all cased characters in the string are
11778
uppercase and there is at least one cased character in the string.
11779
[clinic start generated code]*/
11780
11781
static PyObject *
11782
unicode_isupper_impl(PyObject *self)
11783
/*[clinic end generated code: output=049209c8e7f15f59 input=77d29904aef0e3a0]*/
11784
10.7k
{
11785
10.7k
    Py_ssize_t i, length;
11786
10.7k
    int kind;
11787
10.7k
    const void *data;
11788
10.7k
    int cased;
11789
11790
10.7k
    length = PyUnicode_GET_LENGTH(self);
11791
10.7k
    kind = PyUnicode_KIND(self);
11792
10.7k
    data = PyUnicode_DATA(self);
11793
11794
    /* Shortcut for single character strings */
11795
10.7k
    if (length == 1)
11796
0
        return PyBool_FromLong(
11797
0
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11798
11799
    /* Special case for empty strings */
11800
10.7k
    if (length == 0)
11801
0
        Py_RETURN_FALSE;
11802
11803
10.7k
    cased = 0;
11804
135k
    for (i = 0; i < length; i++) {
11805
126k
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11806
11807
126k
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11808
1.36k
            Py_RETURN_FALSE;
11809
124k
        else if (!cased && Py_UNICODE_ISUPPER(ch))
11810
9.56k
            cased = 1;
11811
126k
    }
11812
9.42k
    return PyBool_FromLong(cased);
11813
10.7k
}
11814
11815
/*[clinic input]
11816
str.istitle as unicode_istitle
11817
11818
Return True if the string is a title-cased string, False otherwise.
11819
11820
In a title-cased string, upper- and title-case characters may only
11821
follow uncased characters and lowercase characters only cased ones.
11822
[clinic start generated code]*/
11823
11824
static PyObject *
11825
unicode_istitle_impl(PyObject *self)
11826
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
11827
0
{
11828
0
    Py_ssize_t i, length;
11829
0
    int kind;
11830
0
    const void *data;
11831
0
    int cased, previous_is_cased;
11832
11833
0
    length = PyUnicode_GET_LENGTH(self);
11834
0
    kind = PyUnicode_KIND(self);
11835
0
    data = PyUnicode_DATA(self);
11836
11837
    /* Shortcut for single character strings */
11838
0
    if (length == 1) {
11839
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11840
0
        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11841
0
                               (Py_UNICODE_ISUPPER(ch) != 0));
11842
0
    }
11843
11844
    /* Special case for empty strings */
11845
0
    if (length == 0)
11846
0
        Py_RETURN_FALSE;
11847
11848
0
    cased = 0;
11849
0
    previous_is_cased = 0;
11850
0
    for (i = 0; i < length; i++) {
11851
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11852
11853
0
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11854
0
            if (previous_is_cased)
11855
0
                Py_RETURN_FALSE;
11856
0
            previous_is_cased = 1;
11857
0
            cased = 1;
11858
0
        }
11859
0
        else if (Py_UNICODE_ISLOWER(ch)) {
11860
0
            if (!previous_is_cased)
11861
0
                Py_RETURN_FALSE;
11862
0
            previous_is_cased = 1;
11863
0
            cased = 1;
11864
0
        }
11865
0
        else
11866
0
            previous_is_cased = 0;
11867
0
    }
11868
0
    return PyBool_FromLong(cased);
11869
0
}
11870
11871
/*[clinic input]
11872
str.isspace as unicode_isspace
11873
11874
Return True if the string is a whitespace string, False otherwise.
11875
11876
A string is whitespace if all characters in the string are
11877
whitespace and there is at least one character in the string.
11878
[clinic start generated code]*/
11879
11880
static PyObject *
11881
unicode_isspace_impl(PyObject *self)
11882
/*[clinic end generated code: output=163a63bfa08ac2b9 input=29e09560fc23fbeb]*/
11883
1.44M
{
11884
1.44M
    Py_ssize_t i, length;
11885
1.44M
    int kind;
11886
1.44M
    const void *data;
11887
11888
1.44M
    length = PyUnicode_GET_LENGTH(self);
11889
1.44M
    kind = PyUnicode_KIND(self);
11890
1.44M
    data = PyUnicode_DATA(self);
11891
11892
    /* Shortcut for single character strings */
11893
1.44M
    if (length == 1)
11894
1.44M
        return PyBool_FromLong(
11895
1.44M
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11896
11897
    /* Special case for empty strings */
11898
1.33k
    if (length == 0)
11899
284
        Py_RETURN_FALSE;
11900
11901
7.60k
    for (i = 0; i < length; i++) {
11902
7.50k
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11903
7.50k
        if (!Py_UNICODE_ISSPACE(ch))
11904
944
            Py_RETURN_FALSE;
11905
7.50k
    }
11906
1.04k
    Py_RETURN_TRUE;
11907
1.04k
}
11908
11909
/*[clinic input]
11910
str.isalpha as unicode_isalpha
11911
11912
Return True if the string is an alphabetic string, False otherwise.
11913
11914
A string is alphabetic if all characters in the string are
11915
alphabetic and there is at least one character in the string.
11916
[clinic start generated code]*/
11917
11918
static PyObject *
11919
unicode_isalpha_impl(PyObject *self)
11920
/*[clinic end generated code: output=cc81b9ac3883ec4f input=9906a07f3e04892e]*/
11921
19
{
11922
19
    Py_ssize_t i, length;
11923
19
    int kind;
11924
19
    const void *data;
11925
11926
19
    length = PyUnicode_GET_LENGTH(self);
11927
19
    kind = PyUnicode_KIND(self);
11928
19
    data = PyUnicode_DATA(self);
11929
11930
    /* Shortcut for single character strings */
11931
19
    if (length == 1)
11932
14
        return PyBool_FromLong(
11933
14
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11934
11935
    /* Special case for empty strings */
11936
5
    if (length == 0)
11937
0
        Py_RETURN_FALSE;
11938
11939
5
    for (i = 0; i < length; i++) {
11940
5
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11941
5
            Py_RETURN_FALSE;
11942
5
    }
11943
5
    Py_RETURN_TRUE;
11944
5
}
11945
11946
/*[clinic input]
11947
@permit_long_summary
11948
str.isalnum as unicode_isalnum
11949
11950
Return True if the string is an alpha-numeric string, False otherwise.
11951
11952
A string is alpha-numeric if all characters in the string are
11953
alpha-numeric and there is at least one character in the string.
11954
[clinic start generated code]*/
11955
11956
static PyObject *
11957
unicode_isalnum_impl(PyObject *self)
11958
/*[clinic end generated code: output=a5a23490ffc3660c input=892f64ebc171fd4f]*/
11959
0
{
11960
0
    int kind;
11961
0
    const void *data;
11962
0
    Py_ssize_t len, i;
11963
11964
0
    kind = PyUnicode_KIND(self);
11965
0
    data = PyUnicode_DATA(self);
11966
0
    len = PyUnicode_GET_LENGTH(self);
11967
11968
    /* Shortcut for single character strings */
11969
0
    if (len == 1) {
11970
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11971
0
        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11972
0
    }
11973
11974
    /* Special case for empty strings */
11975
0
    if (len == 0)
11976
0
        Py_RETURN_FALSE;
11977
11978
0
    for (i = 0; i < len; i++) {
11979
0
        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11980
0
        if (!Py_UNICODE_ISALNUM(ch))
11981
0
            Py_RETURN_FALSE;
11982
0
    }
11983
0
    Py_RETURN_TRUE;
11984
0
}
11985
11986
/*[clinic input]
11987
str.isdecimal as unicode_isdecimal
11988
11989
Return True if the string is a decimal string, False otherwise.
11990
11991
A string is a decimal string if all characters in the string are
11992
decimal and there is at least one character in the string.
11993
[clinic start generated code]*/
11994
11995
static PyObject *
11996
unicode_isdecimal_impl(PyObject *self)
11997
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=63b0453c48cad0af]*/
11998
1.43k
{
11999
1.43k
    Py_ssize_t i, length;
12000
1.43k
    int kind;
12001
1.43k
    const void *data;
12002
12003
1.43k
    length = PyUnicode_GET_LENGTH(self);
12004
1.43k
    kind = PyUnicode_KIND(self);
12005
1.43k
    data = PyUnicode_DATA(self);
12006
12007
    /* Shortcut for single character strings */
12008
1.43k
    if (length == 1)
12009
214
        return PyBool_FromLong(
12010
214
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
12011
12012
    /* Special case for empty strings */
12013
1.22k
    if (length == 0)
12014
0
        Py_RETURN_FALSE;
12015
12016
8.04k
    for (i = 0; i < length; i++) {
12017
7.45k
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
12018
636
            Py_RETURN_FALSE;
12019
7.45k
    }
12020
1.22k
    Py_RETURN_TRUE;
12021
1.22k
}
12022
12023
/*[clinic input]
12024
str.isdigit as unicode_isdigit
12025
12026
Return True if the string is a digit string, False otherwise.
12027
12028
A string is a digit string if all characters in the string are
12029
digits and there is at least one character in the string.
12030
[clinic start generated code]*/
12031
12032
static PyObject *
12033
unicode_isdigit_impl(PyObject *self)
12034
/*[clinic end generated code: output=10a6985311da6858 input=353b03747b062e4b]*/
12035
1.19M
{
12036
1.19M
    Py_ssize_t i, length;
12037
1.19M
    int kind;
12038
1.19M
    const void *data;
12039
12040
1.19M
    length = PyUnicode_GET_LENGTH(self);
12041
1.19M
    kind = PyUnicode_KIND(self);
12042
1.19M
    data = PyUnicode_DATA(self);
12043
12044
    /* Shortcut for single character strings */
12045
1.19M
    if (length == 1) {
12046
1.19M
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12047
1.19M
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12048
1.19M
    }
12049
12050
    /* Special case for empty strings */
12051
408
    if (length == 0)
12052
0
        Py_RETURN_FALSE;
12053
12054
1.45k
    for (i = 0; i < length; i++) {
12055
1.04k
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
12056
0
            Py_RETURN_FALSE;
12057
1.04k
    }
12058
408
    Py_RETURN_TRUE;
12059
408
}
12060
12061
/*[clinic input]
12062
str.isnumeric as unicode_isnumeric
12063
12064
Return True if the string is a numeric string, False otherwise.
12065
12066
A string is numeric if all characters in the string are numeric and
12067
there is at least one character in the string.
12068
[clinic start generated code]*/
12069
12070
static PyObject *
12071
unicode_isnumeric_impl(PyObject *self)
12072
/*[clinic end generated code: output=9172a32d9013051a input=83b2a072ed7aff48]*/
12073
0
{
12074
0
    Py_ssize_t i, length;
12075
0
    int kind;
12076
0
    const void *data;
12077
12078
0
    length = PyUnicode_GET_LENGTH(self);
12079
0
    kind = PyUnicode_KIND(self);
12080
0
    data = PyUnicode_DATA(self);
12081
12082
    /* Shortcut for single character strings */
12083
0
    if (length == 1)
12084
0
        return PyBool_FromLong(
12085
0
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
12086
12087
    /* Special case for empty strings */
12088
0
    if (length == 0)
12089
0
        Py_RETURN_FALSE;
12090
12091
0
    for (i = 0; i < length; i++) {
12092
0
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
12093
0
            Py_RETURN_FALSE;
12094
0
    }
12095
0
    Py_RETURN_TRUE;
12096
0
}
12097
12098
Py_ssize_t
12099
_PyUnicode_ScanIdentifier(PyObject *self)
12100
61.8k
{
12101
61.8k
    Py_ssize_t i;
12102
61.8k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12103
61.8k
    if (len == 0) {
12104
        /* an empty string is not a valid identifier */
12105
0
        return 0;
12106
0
    }
12107
12108
61.8k
    int kind = PyUnicode_KIND(self);
12109
61.8k
    const void *data = PyUnicode_DATA(self);
12110
61.8k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12111
    /* PEP 3131 says that the first character must be in
12112
       XID_Start and subsequent characters in XID_Continue,
12113
       and for the ASCII range, the 2.x rules apply (i.e
12114
       start with letters and underscore, continue with
12115
       letters, digits, underscore). However, given the current
12116
       definition of XID_Start and XID_Continue, it is sufficient
12117
       to check just for these, except that _ must be allowed
12118
       as starting an identifier.  */
12119
61.8k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12120
796
        return 0;
12121
796
    }
12122
12123
510k
    for (i = 1; i < len; i++) {
12124
449k
        ch = PyUnicode_READ(kind, data, i);
12125
449k
        if (!_PyUnicode_IsXidContinue(ch)) {
12126
342
            return i;
12127
342
        }
12128
449k
    }
12129
60.6k
    return i;
12130
61.0k
}
12131
12132
int
12133
PyUnicode_IsIdentifier(PyObject *self)
12134
50.6k
{
12135
50.6k
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12136
50.6k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12137
    /* an empty string is not a valid identifier */
12138
50.6k
    return len && i == len;
12139
50.6k
}
12140
12141
/*[clinic input]
12142
@permit_long_summary
12143
str.isidentifier as unicode_isidentifier
12144
12145
Return True if the string is a valid Python identifier, False otherwise.
12146
12147
Call keyword.iskeyword(s) to test whether string s is a reserved
12148
identifier, such as "def" or "class".
12149
[clinic start generated code]*/
12150
12151
static PyObject *
12152
unicode_isidentifier_impl(PyObject *self)
12153
/*[clinic end generated code: output=fe585a9666572905 input=cabde62c20a3be6b]*/
12154
48.2k
{
12155
48.2k
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12156
48.2k
}
12157
12158
/*[clinic input]
12159
@permit_long_summary
12160
str.isprintable as unicode_isprintable
12161
12162
Return True if all characters in the string are printable, False otherwise.
12163
12164
A character is printable if repr() may use it in its output.
12165
[clinic start generated code]*/
12166
12167
static PyObject *
12168
unicode_isprintable_impl(PyObject *self)
12169
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
12170
1.85M
{
12171
1.85M
    Py_ssize_t i, length;
12172
1.85M
    int kind;
12173
1.85M
    const void *data;
12174
12175
1.85M
    length = PyUnicode_GET_LENGTH(self);
12176
1.85M
    kind = PyUnicode_KIND(self);
12177
1.85M
    data = PyUnicode_DATA(self);
12178
12179
    /* Shortcut for single character strings */
12180
1.85M
    if (length == 1)
12181
1.85M
        return PyBool_FromLong(
12182
1.85M
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
12183
12184
0
    for (i = 0; i < length; i++) {
12185
0
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
12186
0
            Py_RETURN_FALSE;
12187
0
        }
12188
0
    }
12189
0
    Py_RETURN_TRUE;
12190
0
}
12191
12192
/*[clinic input]
12193
str.join as unicode_join
12194
12195
    iterable: object
12196
    /
12197
12198
Concatenate any number of strings.
12199
12200
The string whose method is called is inserted in between each given
12201
string.  The result is returned as a new string.
12202
12203
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12204
[clinic start generated code]*/
12205
12206
static PyObject *
12207
unicode_join(PyObject *self, PyObject *iterable)
12208
/*[clinic end generated code: output=6857e7cecfe7bf98 input=fd330a11ee845fb2]*/
12209
19.5M
{
12210
19.5M
    return PyUnicode_Join(self, iterable);
12211
19.5M
}
12212
12213
static Py_ssize_t
12214
unicode_length(PyObject *self)
12215
33.0M
{
12216
33.0M
    return PyUnicode_GET_LENGTH(self);
12217
33.0M
}
12218
12219
/*[clinic input]
12220
str.ljust as unicode_ljust
12221
12222
    width: Py_ssize_t
12223
    fillchar: Py_UCS4 = ' '
12224
    /
12225
12226
Return a left-justified string of length width.
12227
12228
Padding is done using the specified fill character (default is
12229
a space).
12230
[clinic start generated code]*/
12231
12232
static PyObject *
12233
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12234
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=8a55f06694c20ed6]*/
12235
130
{
12236
130
    if (PyUnicode_GET_LENGTH(self) >= width)
12237
62
        return unicode_result_unchanged(self);
12238
12239
68
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12240
130
}
12241
12242
/*[clinic input]
12243
str.lower as unicode_lower
12244
12245
Return a copy of the string converted to lowercase.
12246
[clinic start generated code]*/
12247
12248
static PyObject *
12249
unicode_lower_impl(PyObject *self)
12250
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
12251
76.3M
{
12252
76.3M
    if (PyUnicode_IS_ASCII(self))
12253
70.2M
        return ascii_upper_or_lower(self, 1);
12254
6.04M
    return case_operation(self, do_lower);
12255
76.3M
}
12256
12257
67.1M
#define LEFTSTRIP 0
12258
81.5M
#define RIGHTSTRIP 1
12259
46.2M
#define BOTHSTRIP 2
12260
12261
/* Arrays indexed by above */
12262
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};
12263
12264
0
#define STRIPNAME(i) (stripfuncnames[i])
12265
12266
/* externally visible for str.strip(unicode) */
12267
PyObject *
12268
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12269
4.30M
{
12270
4.30M
    const void *data;
12271
4.30M
    int kind;
12272
4.30M
    Py_ssize_t i, j, len;
12273
4.30M
    BLOOM_MASK sepmask;
12274
4.30M
    Py_ssize_t seplen;
12275
12276
4.30M
    kind = PyUnicode_KIND(self);
12277
4.30M
    data = PyUnicode_DATA(self);
12278
4.30M
    len = PyUnicode_GET_LENGTH(self);
12279
4.30M
    seplen = PyUnicode_GET_LENGTH(sepobj);
12280
4.30M
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12281
4.30M
                              PyUnicode_DATA(sepobj),
12282
4.30M
                              seplen);
12283
12284
4.30M
    i = 0;
12285
4.30M
    if (striptype != RIGHTSTRIP) {
12286
476k
        while (i < len) {
12287
473k
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12288
473k
            if (!BLOOM(sepmask, ch))
12289
437k
                break;
12290
35.6k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12291
2.74k
                break;
12292
32.9k
            i++;
12293
32.9k
        }
12294
443k
    }
12295
12296
4.30M
    j = len;
12297
4.30M
    if (striptype != LEFTSTRIP) {
12298
3.86M
        j--;
12299
4.49M
        while (j >= i) {
12300
3.41M
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12301
3.41M
            if (!BLOOM(sepmask, ch))
12302
2.66M
                break;
12303
757k
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12304
120k
                break;
12305
636k
            j--;
12306
636k
        }
12307
12308
3.86M
        j++;
12309
3.86M
    }
12310
12311
4.30M
    return PyUnicode_Substring(self, i, j);
12312
4.30M
}
12313
12314
PyObject*
12315
_PyUnicode_BinarySlice(PyObject *container, PyObject *start_o, PyObject *stop_o)
12316
31.9M
{
12317
31.9M
    assert(PyUnicode_CheckExact(container));
12318
31.9M
    Py_ssize_t len = PyUnicode_GET_LENGTH(container);
12319
31.9M
    Py_ssize_t istart, istop;
12320
31.9M
    if (!_PyEval_UnpackIndices(start_o, stop_o, len, &istart, &istop)) {
12321
0
        return NULL;
12322
0
    }
12323
31.9M
    return PyUnicode_Substring(container, istart, istop);
12324
31.9M
}
12325
12326
PyObject*
12327
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12328
272M
{
12329
272M
    const unsigned char *data;
12330
272M
    int kind;
12331
272M
    Py_ssize_t length;
12332
12333
272M
    length = PyUnicode_GET_LENGTH(self);
12334
272M
    end = Py_MIN(end, length);
12335
12336
272M
    if (start == 0 && end == length)
12337
68.8M
        return unicode_result_unchanged(self);
12338
12339
203M
    if (start < 0 || end < 0) {
12340
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12341
0
        return NULL;
12342
0
    }
12343
203M
    if (start >= length || end < start)
12344
5.44M
        _Py_RETURN_UNICODE_EMPTY();
12345
12346
198M
    length = end - start;
12347
198M
    if (PyUnicode_IS_ASCII(self)) {
12348
64.3M
        data = PyUnicode_1BYTE_DATA(self);
12349
64.3M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12350
64.3M
    }
12351
133M
    else {
12352
133M
        kind = PyUnicode_KIND(self);
12353
133M
        data = PyUnicode_1BYTE_DATA(self);
12354
133M
        return PyUnicode_FromKindAndData(kind,
12355
133M
                                         data + kind * start,
12356
133M
                                         length);
12357
133M
    }
12358
198M
}
12359
12360
static PyObject *
12361
do_strip(PyObject *self, int striptype)
12362
60.6M
{
12363
60.6M
    Py_ssize_t len, i, j;
12364
12365
60.6M
    len = PyUnicode_GET_LENGTH(self);
12366
12367
60.6M
    if (PyUnicode_IS_ASCII(self)) {
12368
47.7M
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12369
12370
47.7M
        i = 0;
12371
47.7M
        if (striptype != RIGHTSTRIP) {
12372
43.0M
            while (i < len) {
12373
34.5M
                Py_UCS1 ch = data[i];
12374
34.5M
                if (!_Py_ascii_whitespace[ch])
12375
29.5M
                    break;
12376
5.01M
                i++;
12377
5.01M
            }
12378
37.9M
        }
12379
12380
47.7M
        j = len;
12381
47.7M
        if (striptype != LEFTSTRIP) {
12382
47.3M
            j--;
12383
52.3M
            while (j >= i) {
12384
38.7M
                Py_UCS1 ch = data[j];
12385
38.7M
                if (!_Py_ascii_whitespace[ch])
12386
33.7M
                    break;
12387
4.99M
                j--;
12388
4.99M
            }
12389
47.3M
            j++;
12390
47.3M
        }
12391
47.7M
    }
12392
12.9M
    else {
12393
12.9M
        int kind = PyUnicode_KIND(self);
12394
12.9M
        const void *data = PyUnicode_DATA(self);
12395
12396
12.9M
        i = 0;
12397
12.9M
        if (striptype != RIGHTSTRIP) {
12398
12.1M
            while (i < len) {
12399
12.1M
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12400
12.1M
                if (!Py_UNICODE_ISSPACE(ch))
12401
9.96M
                    break;
12402
2.14M
                i++;
12403
2.14M
            }
12404
9.96M
        }
12405
12406
12.9M
        j = len;
12407
12.9M
        if (striptype != LEFTSTRIP) {
12408
11.6M
            j--;
12409
14.6M
            while (j >= i) {
12410
14.5M
                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12411
14.5M
                if (!Py_UNICODE_ISSPACE(ch))
12412
11.6M
                    break;
12413
2.96M
                j--;
12414
2.96M
            }
12415
11.6M
            j++;
12416
11.6M
        }
12417
12.9M
    }
12418
12419
60.6M
    return PyUnicode_Substring(self, i, j);
12420
60.6M
}
12421
12422
12423
static PyObject *
12424
do_argstrip(PyObject *self, int striptype, PyObject *sep)
12425
64.9M
{
12426
64.9M
    if (sep != Py_None) {
12427
4.30M
        if (PyUnicode_Check(sep))
12428
4.30M
            return _PyUnicode_XStrip(self, striptype, sep);
12429
0
        else {
12430
0
            PyErr_Format(PyExc_TypeError,
12431
0
                         "%s arg must be None or str",
12432
0
                         STRIPNAME(striptype));
12433
0
            return NULL;
12434
0
        }
12435
4.30M
    }
12436
12437
60.6M
    return do_strip(self, striptype);
12438
64.9M
}
12439
12440
12441
/*[clinic input]
12442
@permit_long_summary
12443
str.strip as unicode_strip
12444
12445
    chars: object = None
12446
    /
12447
12448
Return a copy of the string with leading and trailing whitespace removed.
12449
12450
If chars is given and not None, remove characters in chars instead.
12451
[clinic start generated code]*/
12452
12453
static PyObject *
12454
unicode_strip_impl(PyObject *self, PyObject *chars)
12455
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
12456
46.2M
{
12457
46.2M
    return do_argstrip(self, BOTHSTRIP, chars);
12458
46.2M
}
12459
12460
12461
/*[clinic input]
12462
str.lstrip as unicode_lstrip
12463
12464
    chars: object = None
12465
    /
12466
12467
Return a copy of the string with leading whitespace removed.
12468
12469
If chars is given and not None, remove characters in chars instead.
12470
[clinic start generated code]*/
12471
12472
static PyObject *
12473
unicode_lstrip_impl(PyObject *self, PyObject *chars)
12474
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
12475
2.16M
{
12476
2.16M
    return do_argstrip(self, LEFTSTRIP, chars);
12477
2.16M
}
12478
12479
12480
/*[clinic input]
12481
str.rstrip as unicode_rstrip
12482
12483
    chars: object = None
12484
    /
12485
12486
Return a copy of the string with trailing whitespace removed.
12487
12488
If chars is given and not None, remove characters in chars instead.
12489
[clinic start generated code]*/
12490
12491
static PyObject *
12492
unicode_rstrip_impl(PyObject *self, PyObject *chars)
12493
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
12494
16.5M
{
12495
16.5M
    return do_argstrip(self, RIGHTSTRIP, chars);
12496
16.5M
}
12497
12498
12499
PyObject *
12500
_PyUnicode_Repeat(PyObject *str, Py_ssize_t len)
12501
293k
{
12502
293k
    PyObject *u;
12503
293k
    Py_ssize_t nchars, n;
12504
12505
293k
    if (len < 1)
12506
33.8k
        _Py_RETURN_UNICODE_EMPTY();
12507
12508
    /* no repeat, return original string */
12509
259k
    if (len == 1)
12510
28.2k
        return unicode_result_unchanged(str);
12511
12512
231k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12513
0
        PyErr_SetString(PyExc_OverflowError,
12514
0
                        "repeated string is too long");
12515
0
        return NULL;
12516
0
    }
12517
231k
    nchars = len * PyUnicode_GET_LENGTH(str);
12518
12519
231k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12520
231k
    if (!u)
12521
0
        return NULL;
12522
231k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12523
12524
231k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12525
229k
        int kind = PyUnicode_KIND(str);
12526
229k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12527
229k
        if (kind == PyUnicode_1BYTE_KIND) {
12528
229k
            void *to = PyUnicode_DATA(u);
12529
229k
            memset(to, (unsigned char)fill_char, len);
12530
229k
        }
12531
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12532
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12533
0
            for (n = 0; n < len; ++n)
12534
0
                ucs2[n] = fill_char;
12535
0
        } else {
12536
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12537
0
            assert(kind == PyUnicode_4BYTE_KIND);
12538
0
            for (n = 0; n < len; ++n)
12539
0
                ucs4[n] = fill_char;
12540
0
        }
12541
229k
    }
12542
2.33k
    else {
12543
2.33k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12544
2.33k
        char *to = (char *) PyUnicode_DATA(u);
12545
2.33k
        _PyBytes_RepeatBuffer(to, nchars * char_size, PyUnicode_DATA(str),
12546
2.33k
            PyUnicode_GET_LENGTH(str) * char_size);
12547
2.33k
    }
12548
12549
231k
    assert(_PyUnicode_CheckConsistency(u, 1));
12550
231k
    return u;
12551
231k
}
12552
12553
PyObject *
12554
PyUnicode_Replace(PyObject *str,
12555
                  PyObject *substr,
12556
                  PyObject *replstr,
12557
                  Py_ssize_t maxcount)
12558
0
{
12559
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12560
0
            ensure_unicode(replstr) < 0)
12561
0
        return NULL;
12562
0
    return replace(str, substr, replstr, maxcount);
12563
0
}
12564
12565
/*[clinic input]
12566
str.replace as unicode_replace
12567
12568
    old: unicode
12569
    new: unicode
12570
    /
12571
    count: Py_ssize_t = -1
12572
        Maximum number of occurrences to replace.
12573
        -1 (the default value) means replace all occurrences.
12574
12575
Return a copy with all occurrences of substring old replaced by new.
12576
12577
If count is given, only the first count occurrences are replaced.
12578
If count is not specified or -1, then all occurrences are replaced.
12579
[clinic start generated code]*/
12580
12581
static PyObject *
12582
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12583
                     Py_ssize_t count)
12584
/*[clinic end generated code: output=b63f1a8b5eebf448 input=d15a6886b05e2edc]*/
12585
19.6M
{
12586
19.6M
    return replace(self, old, new, count);
12587
19.6M
}
12588
12589
/*[clinic input]
12590
str.removeprefix as unicode_removeprefix
12591
12592
    prefix: unicode
12593
    /
12594
12595
Return a str with the given prefix string removed if present.
12596
12597
If the string starts with the prefix string, return
12598
string[len(prefix):].  Otherwise, return a copy of the original
12599
string.
12600
[clinic start generated code]*/
12601
12602
static PyObject *
12603
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
12604
/*[clinic end generated code: output=f1e5945e9763bcb9 input=90d162724944bfa7]*/
12605
28
{
12606
28
    int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
12607
28
    if (match == -1) {
12608
0
        return NULL;
12609
0
    }
12610
28
    if (match) {
12611
28
        return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
12612
28
                                   PyUnicode_GET_LENGTH(self));
12613
28
    }
12614
0
    return unicode_result_unchanged(self);
12615
28
}
12616
12617
/*[clinic input]
12618
str.removesuffix as unicode_removesuffix
12619
12620
    suffix: unicode
12621
    /
12622
12623
Return a str with the given suffix string removed if present.
12624
12625
If the string ends with the suffix string and that suffix is not
12626
empty, return string[:-len(suffix)].  Otherwise, return a copy of
12627
the original string.
12628
[clinic start generated code]*/
12629
12630
static PyObject *
12631
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
12632
/*[clinic end generated code: output=d36629e227636822 input=6efc96152d4bfcd5]*/
12633
0
{
12634
0
    int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
12635
0
    if (match == -1) {
12636
0
        return NULL;
12637
0
    }
12638
0
    if (match) {
12639
0
        return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
12640
0
                                            - PyUnicode_GET_LENGTH(suffix));
12641
0
    }
12642
0
    return unicode_result_unchanged(self);
12643
0
}
12644
12645
static PyObject *
12646
unicode_repr(PyObject *unicode)
12647
12.6M
{
12648
12.6M
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
12649
12.6M
    const void *idata = PyUnicode_DATA(unicode);
12650
12651
    /* Compute length of output, quote characters, and
12652
       maximum character */
12653
12.6M
    Py_ssize_t osize = 0;
12654
12.6M
    Py_UCS4 maxch = 127;
12655
12.6M
    Py_ssize_t squote = 0;
12656
12.6M
    Py_ssize_t dquote = 0;
12657
12.6M
    int ikind = PyUnicode_KIND(unicode);
12658
310M
    for (Py_ssize_t i = 0; i < isize; i++) {
12659
298M
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12660
298M
        Py_ssize_t incr = 1;
12661
298M
        switch (ch) {
12662
372k
        case '\'': squote++; break;
12663
557k
        case '"':  dquote++; break;
12664
4.48M
        case '\\': case '\t': case '\r': case '\n':
12665
4.48M
            incr = 2;
12666
4.48M
            break;
12667
292M
        default:
12668
            /* Fast-path ASCII */
12669
292M
            if (ch < ' ' || ch == 0x7f)
12670
170M
                incr = 4; /* \xHH */
12671
122M
            else if (ch < 0x7f)
12672
108M
                ;
12673
13.7M
            else if (Py_UNICODE_ISPRINTABLE(ch))
12674
13.4M
                maxch = (ch > maxch) ? ch : maxch;
12675
334k
            else if (ch < 0x100)
12676
67.4k
                incr = 4; /* \xHH */
12677
267k
            else if (ch < 0x10000)
12678
89.8k
                incr = 6; /* \uHHHH */
12679
177k
            else
12680
177k
                incr = 10; /* \uHHHHHHHH */
12681
298M
        }
12682
298M
        if (osize > PY_SSIZE_T_MAX - incr) {
12683
0
            PyErr_SetString(PyExc_OverflowError,
12684
0
                            "string is too long to generate repr");
12685
0
            return NULL;
12686
0
        }
12687
298M
        osize += incr;
12688
298M
    }
12689
12690
12.6M
    Py_UCS4 quote = '\'';
12691
12.6M
    int changed = (osize != isize);
12692
12.6M
    if (squote) {
12693
104k
        changed = 1;
12694
104k
        if (dquote)
12695
            /* Both squote and dquote present. Use squote,
12696
               and escape them */
12697
7.44k
            osize += squote;
12698
97.4k
        else
12699
97.4k
            quote = '"';
12700
104k
    }
12701
12.6M
    osize += 2;   /* quotes */
12702
12703
12.6M
    PyObject *repr = PyUnicode_New(osize, maxch);
12704
12.6M
    if (repr == NULL)
12705
0
        return NULL;
12706
12.6M
    int okind = PyUnicode_KIND(repr);
12707
12.6M
    void *odata = PyUnicode_DATA(repr);
12708
12709
12.6M
    if (!changed) {
12710
7.02M
        PyUnicode_WRITE(okind, odata, 0, quote);
12711
12712
7.02M
        _PyUnicode_FastCopyCharacters(repr, 1,
12713
7.02M
                                      unicode, 0,
12714
7.02M
                                      isize);
12715
12716
7.02M
        PyUnicode_WRITE(okind, odata, osize-1, quote);
12717
7.02M
    }
12718
5.67M
    else {
12719
5.67M
        switch (okind) {
12720
5.39M
        case PyUnicode_1BYTE_KIND:
12721
5.39M
            ucs1lib_repr(unicode, quote, odata);
12722
5.39M
            break;
12723
267k
        case PyUnicode_2BYTE_KIND:
12724
267k
            ucs2lib_repr(unicode, quote, odata);
12725
267k
            break;
12726
8.10k
        default:
12727
8.10k
            assert(okind == PyUnicode_4BYTE_KIND);
12728
8.10k
            ucs4lib_repr(unicode, quote, odata);
12729
5.67M
        }
12730
5.67M
    }
12731
12732
12.6M
    assert(_PyUnicode_CheckConsistency(repr, 1));
12733
12.6M
    return repr;
12734
12.6M
}
12735
12736
/*[clinic input]
12737
@permit_long_summary
12738
str.rfind as unicode_rfind = str.count
12739
12740
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12741
12742
Optional arguments start and end are interpreted as in slice
12743
notation.  Return -1 on failure.
12744
[clinic start generated code]*/
12745
12746
static Py_ssize_t
12747
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12748
                   Py_ssize_t end)
12749
/*[clinic end generated code: output=880b29f01dd014c8 input=2e67789533baf2f5]*/
12750
215k
{
12751
215k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12752
215k
    if (result < 0) {
12753
10.3k
        return -1;
12754
10.3k
    }
12755
205k
    return result;
12756
215k
}
12757
12758
/*[clinic input]
12759
@permit_long_summary
12760
str.rindex as unicode_rindex = str.count
12761
12762
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12763
12764
Optional arguments start and end are interpreted as in slice
12765
notation.  Raises ValueError when the substring is not found.
12766
[clinic start generated code]*/
12767
12768
static Py_ssize_t
12769
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12770
                    Py_ssize_t end)
12771
/*[clinic end generated code: output=5f3aef124c867fe1 input=e29d446c8234c9d9]*/
12772
165k
{
12773
165k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12774
165k
    if (result == -1) {
12775
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12776
0
    }
12777
165k
    else if (result < 0) {
12778
0
        return -1;
12779
0
    }
12780
165k
    return result;
12781
165k
}
12782
12783
/*[clinic input]
12784
str.rjust as unicode_rjust
12785
12786
    width: Py_ssize_t
12787
    fillchar: Py_UCS4 = ' '
12788
    /
12789
12790
Return a right-justified string of length width.
12791
12792
Padding is done using the specified fill character (default is
12793
a space).
12794
[clinic start generated code]*/
12795
12796
static PyObject *
12797
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12798
/*[clinic end generated code: output=804a1a57fbe8d5cf input=1256a8d659589907]*/
12799
0
{
12800
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12801
0
        return unicode_result_unchanged(self);
12802
12803
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12804
0
}
12805
12806
PyObject *
12807
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12808
0
{
12809
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12810
0
        return NULL;
12811
12812
0
    return split(s, sep, maxsplit);
12813
0
}
12814
12815
/*[clinic input]
12816
@permit_long_summary
12817
str.split as unicode_split
12818
12819
    sep: object = None
12820
        The separator used to split the string.
12821
12822
        When set to None (the default value), will split on any
12823
        whitespace character (including \n \r \t \f and spaces) and
12824
        will discard empty strings from the result.
12825
    maxsplit: Py_ssize_t = -1
12826
        Maximum number of splits.
12827
        -1 (the default value) means no limit.
12828
12829
Return a list of the substrings in the string, using sep as the separator string.
12830
12831
Splitting starts at the front of the string and works to the end.
12832
12833
Note, str.split() is mainly useful for data that has been
12834
intentionally delimited.  With natural text that includes
12835
punctuation, consider using the regular expression module.
12836
12837
[clinic start generated code]*/
12838
12839
static PyObject *
12840
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12841
/*[clinic end generated code: output=3a65b1db356948dc input=288cfd6bc8828f5a]*/
12842
22.0M
{
12843
22.0M
    if (sep == Py_None)
12844
185k
        return split(self, NULL, maxsplit);
12845
21.8M
    if (PyUnicode_Check(sep))
12846
21.8M
        return split(self, sep, maxsplit);
12847
12848
0
    PyErr_Format(PyExc_TypeError,
12849
0
                 "must be str or None, not %.100s",
12850
0
                 Py_TYPE(sep)->tp_name);
12851
0
    return NULL;
12852
21.8M
}
12853
12854
PyObject *
12855
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12856
8.72M
{
12857
8.72M
    PyObject* out;
12858
8.72M
    int kind1, kind2;
12859
8.72M
    const void *buf1, *buf2;
12860
8.72M
    Py_ssize_t len1, len2;
12861
12862
8.72M
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12863
0
        return NULL;
12864
12865
8.72M
    kind1 = PyUnicode_KIND(str_obj);
12866
8.72M
    kind2 = PyUnicode_KIND(sep_obj);
12867
8.72M
    len1 = PyUnicode_GET_LENGTH(str_obj);
12868
8.72M
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12869
8.72M
    if (kind1 < kind2 || len1 < len2) {
12870
1.32k
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
12871
1.32k
        return PyTuple_Pack(3, str_obj, empty, empty);
12872
1.32k
    }
12873
8.72M
    buf1 = PyUnicode_DATA(str_obj);
12874
8.72M
    buf2 = PyUnicode_DATA(sep_obj);
12875
8.72M
    if (kind2 != kind1) {
12876
95.4k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
12877
95.4k
        if (!buf2)
12878
0
            return NULL;
12879
95.4k
    }
12880
12881
8.72M
    switch (kind1) {
12882
8.62M
    case PyUnicode_1BYTE_KIND:
12883
8.62M
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12884
3.04M
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12885
5.57M
        else
12886
5.57M
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12887
8.62M
        break;
12888
82.9k
    case PyUnicode_2BYTE_KIND:
12889
82.9k
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12890
82.9k
        break;
12891
12.5k
    case PyUnicode_4BYTE_KIND:
12892
12.5k
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12893
12.5k
        break;
12894
0
    default:
12895
0
        Py_UNREACHABLE();
12896
8.72M
    }
12897
12898
8.72M
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12899
8.72M
    if (kind2 != kind1)
12900
95.4k
        PyMem_Free((void *)buf2);
12901
12902
8.72M
    return out;
12903
8.72M
}
12904
12905
12906
PyObject *
12907
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12908
57.0k
{
12909
57.0k
    PyObject* out;
12910
57.0k
    int kind1, kind2;
12911
57.0k
    const void *buf1, *buf2;
12912
57.0k
    Py_ssize_t len1, len2;
12913
12914
57.0k
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12915
0
        return NULL;
12916
12917
57.0k
    kind1 = PyUnicode_KIND(str_obj);
12918
57.0k
    kind2 = PyUnicode_KIND(sep_obj);
12919
57.0k
    len1 = PyUnicode_GET_LENGTH(str_obj);
12920
57.0k
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12921
57.0k
    if (kind1 < kind2 || len1 < len2) {
12922
0
        PyObject *empty = _PyUnicode_GetEmpty();  // Borrowed reference
12923
0
        return PyTuple_Pack(3, empty, empty, str_obj);
12924
0
    }
12925
57.0k
    buf1 = PyUnicode_DATA(str_obj);
12926
57.0k
    buf2 = PyUnicode_DATA(sep_obj);
12927
57.0k
    if (kind2 != kind1) {
12928
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
12929
0
        if (!buf2)
12930
0
            return NULL;
12931
0
    }
12932
12933
57.0k
    switch (kind1) {
12934
57.0k
    case PyUnicode_1BYTE_KIND:
12935
57.0k
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12936
57.0k
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12937
0
        else
12938
0
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12939
57.0k
        break;
12940
0
    case PyUnicode_2BYTE_KIND:
12941
0
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12942
0
        break;
12943
0
    case PyUnicode_4BYTE_KIND:
12944
0
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12945
0
        break;
12946
0
    default:
12947
0
        Py_UNREACHABLE();
12948
57.0k
    }
12949
12950
57.0k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
12951
57.0k
    if (kind2 != kind1)
12952
0
        PyMem_Free((void *)buf2);
12953
12954
57.0k
    return out;
12955
57.0k
}
12956
12957
/*[clinic input]
12958
str.partition as unicode_partition
12959
12960
    sep: object
12961
    /
12962
12963
Partition the string into three parts using the given separator.
12964
12965
This will search for the separator in the string.  If the separator
12966
is found, returns a 3-tuple containing the part before the
12967
separator, the separator itself, and the part after it.
12968
12969
If the separator is not found, returns a 3-tuple containing
12970
the original string and two empty strings.
12971
[clinic start generated code]*/
12972
12973
static PyObject *
12974
unicode_partition(PyObject *self, PyObject *sep)
12975
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=e45faa8c26270cb1]*/
12976
8.72M
{
12977
8.72M
    return PyUnicode_Partition(self, sep);
12978
8.72M
}
12979
12980
/*[clinic input]
12981
str.rpartition as unicode_rpartition = str.partition
12982
12983
Partition the string into three parts using the given separator.
12984
12985
This will search for the separator in the string, starting at the
12986
end.  If the separator is found, returns a 3-tuple containing the
12987
part before the separator, the separator itself, and the part after
12988
it.
12989
12990
If the separator is not found, returns a 3-tuple containing two
12991
empty strings and the original string.
12992
[clinic start generated code]*/
12993
12994
static PyObject *
12995
unicode_rpartition(PyObject *self, PyObject *sep)
12996
/*[clinic end generated code: output=1aa13cf1156572aa input=53a7f8cb19975b7c]*/
12997
57.0k
{
12998
57.0k
    return PyUnicode_RPartition(self, sep);
12999
57.0k
}
13000
13001
PyObject *
13002
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13003
0
{
13004
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13005
0
        return NULL;
13006
13007
0
    return rsplit(s, sep, maxsplit);
13008
0
}
13009
13010
/*[clinic input]
13011
@permit_long_summary
13012
str.rsplit as unicode_rsplit = str.split
13013
13014
Return a list of the substrings in the string, using sep as the separator string.
13015
13016
Splitting starts at the end of the string and works to the front.
13017
[clinic start generated code]*/
13018
13019
static PyObject *
13020
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13021
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13022
66
{
13023
66
    if (sep == Py_None)
13024
0
        return rsplit(self, NULL, maxsplit);
13025
66
    if (PyUnicode_Check(sep))
13026
66
        return rsplit(self, sep, maxsplit);
13027
13028
0
    PyErr_Format(PyExc_TypeError,
13029
0
                 "must be str or None, not %.100s",
13030
0
                 Py_TYPE(sep)->tp_name);
13031
0
    return NULL;
13032
66
}
13033
13034
/*[clinic input]
13035
@permit_long_summary
13036
str.splitlines as unicode_splitlines
13037
13038
    keepends: bool = False
13039
13040
Return a list of the lines in the string, breaking at line boundaries.
13041
13042
Line breaks are not included in the resulting list unless keepends
13043
is given and true.
13044
[clinic start generated code]*/
13045
13046
static PyObject *
13047
unicode_splitlines_impl(PyObject *self, int keepends)
13048
/*[clinic end generated code: output=f664dcdad153ec40 input=b45ea0f87645a06d]*/
13049
17.3k
{
13050
17.3k
    return PyUnicode_Splitlines(self, keepends);
13051
17.3k
}
13052
13053
static
13054
PyObject *unicode_str(PyObject *self)
13055
3.17M
{
13056
3.17M
    return unicode_result_unchanged(self);
13057
3.17M
}
13058
13059
/*[clinic input]
13060
@permit_long_summary
13061
str.swapcase as unicode_swapcase
13062
13063
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13064
[clinic start generated code]*/
13065
13066
static PyObject *
13067
unicode_swapcase_impl(PyObject *self)
13068
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
13069
0
{
13070
0
    return case_operation(self, do_swapcase);
13071
0
}
13072
13073
static int
13074
unicode_maketrans_from_dict(PyObject *x, PyObject *newdict)
13075
0
{
13076
0
    PyObject *key, *value;
13077
0
    Py_ssize_t i = 0;
13078
0
    int res;
13079
0
    while (PyDict_Next(x, &i, &key, &value)) {
13080
0
        if (PyUnicode_Check(key)) {
13081
0
            PyObject *newkey;
13082
0
            int kind;
13083
0
            const void *data;
13084
0
            if (PyUnicode_GET_LENGTH(key) != 1) {
13085
0
                PyErr_SetString(PyExc_ValueError, "string keys in translate"
13086
0
                                "table must be of length 1");
13087
0
                return -1;
13088
0
            }
13089
0
            kind = PyUnicode_KIND(key);
13090
0
            data = PyUnicode_DATA(key);
13091
0
            newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13092
0
            if (!newkey)
13093
0
                return -1;
13094
0
            res = PyDict_SetItem(newdict, newkey, value);
13095
0
            Py_DECREF(newkey);
13096
0
            if (res < 0)
13097
0
                return -1;
13098
0
        }
13099
0
        else if (PyLong_Check(key)) {
13100
0
            if (PyDict_SetItem(newdict, key, value) < 0)
13101
0
                return -1;
13102
0
        }
13103
0
        else {
13104
0
            PyErr_SetString(PyExc_TypeError, "keys in translate table must"
13105
0
                            "be strings or integers");
13106
0
            return -1;
13107
0
        }
13108
0
    }
13109
0
    return 0;
13110
0
}
13111
13112
/*[clinic input]
13113
13114
@staticmethod
13115
str.maketrans as unicode_maketrans
13116
13117
  x: object
13118
13119
  y: unicode=NULL
13120
13121
  z: unicode=NULL
13122
13123
  /
13124
13125
Return a translation table usable for str.translate().
13126
13127
If there is only one argument, it must be a dictionary mapping
13128
Unicode ordinals (integers) or characters to Unicode ordinals,
13129
strings or None.  Character keys will be then converted to ordinals.
13130
If there are two arguments, they must be strings of equal length,
13131
and in the resulting dictionary, each character in x will be mapped
13132
to the character at the same position in y.  If there is a third
13133
argument, it must be a string, whose characters will be mapped to
13134
None in the result.
13135
[clinic start generated code]*/
13136
13137
static PyObject *
13138
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13139
/*[clinic end generated code: output=a925c89452bd5881 input=66bc00a1b4258a6e]*/
13140
4
{
13141
4
    PyObject *new = NULL, *key, *value;
13142
4
    Py_ssize_t i = 0;
13143
4
    int res;
13144
13145
4
    new = PyDict_New();
13146
4
    if (!new)
13147
0
        return NULL;
13148
4
    if (y != NULL) {
13149
4
        int x_kind, y_kind, z_kind;
13150
4
        const void *x_data, *y_data, *z_data;
13151
13152
        /* x must be a string too, of equal length */
13153
4
        if (!PyUnicode_Check(x)) {
13154
0
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13155
0
                            "be a string if there is a second argument");
13156
0
            goto err;
13157
0
        }
13158
4
        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13159
0
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13160
0
                            "arguments must have equal length");
13161
0
            goto err;
13162
0
        }
13163
        /* create entries for translating chars in x to those in y */
13164
4
        x_kind = PyUnicode_KIND(x);
13165
4
        y_kind = PyUnicode_KIND(y);
13166
4
        x_data = PyUnicode_DATA(x);
13167
4
        y_data = PyUnicode_DATA(y);
13168
36
        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13169
32
            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13170
32
            if (!key)
13171
0
                goto err;
13172
32
            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13173
32
            if (!value) {
13174
0
                Py_DECREF(key);
13175
0
                goto err;
13176
0
            }
13177
32
            res = PyDict_SetItem(new, key, value);
13178
32
            Py_DECREF(key);
13179
32
            Py_DECREF(value);
13180
32
            if (res < 0)
13181
0
                goto err;
13182
32
        }
13183
        /* create entries for deleting chars in z */
13184
4
        if (z != NULL) {
13185
0
            z_kind = PyUnicode_KIND(z);
13186
0
            z_data = PyUnicode_DATA(z);
13187
0
            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13188
0
                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13189
0
                if (!key)
13190
0
                    goto err;
13191
0
                res = PyDict_SetItem(new, key, Py_None);
13192
0
                Py_DECREF(key);
13193
0
                if (res < 0)
13194
0
                    goto err;
13195
0
            }
13196
0
        }
13197
4
    } else {
13198
        /* x must be a dict */
13199
0
        if (!PyAnyDict_CheckExact(x)) {
13200
0
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13201
0
                            "to maketrans it must be a dict");
13202
0
            goto err;
13203
0
        }
13204
        /* copy entries into the new dict, converting string keys to int keys */
13205
0
        int errcode;
13206
0
        Py_BEGIN_CRITICAL_SECTION(x);
13207
0
        errcode = unicode_maketrans_from_dict(x, new);
13208
0
        Py_END_CRITICAL_SECTION();
13209
0
        if (errcode < 0)
13210
0
            goto err;
13211
0
    }
13212
4
    return new;
13213
0
  err:
13214
0
    Py_DECREF(new);
13215
0
    return NULL;
13216
4
}
13217
13218
/*[clinic input]
13219
@permit_long_summary
13220
str.translate as unicode_translate
13221
13222
    table: object
13223
        Translation table, which must be a mapping of Unicode ordinals
13224
        to Unicode ordinals, strings, or None.
13225
    /
13226
13227
Replace each character in the string using the given translation table.
13228
13229
The table must implement lookup/indexing via __getitem__, for
13230
instance a dictionary or list.  If this operation raises
13231
LookupError, the character is left untouched.  Characters mapped to
13232
None are deleted.
13233
[clinic start generated code]*/
13234
13235
static PyObject *
13236
unicode_translate(PyObject *self, PyObject *table)
13237
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=48cf0efe06bc1b75]*/
13238
12.3k
{
13239
12.3k
    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13240
12.3k
}
13241
13242
/*[clinic input]
13243
str.upper as unicode_upper
13244
13245
Return a copy of the string converted to uppercase.
13246
[clinic start generated code]*/
13247
13248
static PyObject *
13249
unicode_upper_impl(PyObject *self)
13250
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13251
306
{
13252
306
    if (PyUnicode_IS_ASCII(self))
13253
306
        return ascii_upper_or_lower(self, 0);
13254
0
    return case_operation(self, do_upper);
13255
306
}
13256
13257
/*[clinic input]
13258
@permit_long_summary
13259
str.zfill as unicode_zfill
13260
13261
    width: Py_ssize_t
13262
    /
13263
13264
Pad a numeric string with zeros on the left, to fill a field of the given width.
13265
13266
The string is never truncated.
13267
[clinic start generated code]*/
13268
13269
static PyObject *
13270
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13271
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13272
0
{
13273
0
    Py_ssize_t fill;
13274
0
    PyObject *u;
13275
0
    int kind;
13276
0
    const void *data;
13277
0
    Py_UCS4 chr;
13278
13279
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13280
0
        return unicode_result_unchanged(self);
13281
13282
0
    fill = width - PyUnicode_GET_LENGTH(self);
13283
13284
0
    u = pad(self, fill, 0, '0');
13285
13286
0
    if (u == NULL)
13287
0
        return NULL;
13288
13289
0
    kind = PyUnicode_KIND(u);
13290
0
    data = PyUnicode_DATA(u);
13291
0
    chr = PyUnicode_READ(kind, data, fill);
13292
13293
0
    if (chr == '+' || chr == '-') {
13294
        /* move sign to beginning of string */
13295
0
        PyUnicode_WRITE(kind, data, 0, chr);
13296
0
        PyUnicode_WRITE(kind, data, fill, '0');
13297
0
    }
13298
13299
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13300
0
    return u;
13301
0
}
13302
13303
/*[clinic input]
13304
@permit_long_summary
13305
@text_signature "($self, prefix[, start[, end]], /)"
13306
str.startswith as unicode_startswith
13307
13308
    prefix as subobj: object
13309
        A string or a tuple of strings to try.
13310
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13311
        Optional start position. Default: start of the string.
13312
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13313
        Optional stop position. Default: end of the string.
13314
    /
13315
13316
Return True if the string starts with the specified prefix, False otherwise.
13317
[clinic start generated code]*/
13318
13319
static PyObject *
13320
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13321
                        Py_ssize_t end)
13322
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13323
43.3M
{
13324
43.3M
    if (PyTuple_Check(subobj)) {
13325
1.65M
        Py_ssize_t i;
13326
6.03M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13327
4.39M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13328
4.39M
            if (!PyUnicode_Check(substring)) {
13329
0
                PyErr_Format(PyExc_TypeError,
13330
0
                             "tuple for startswith must only contain str, "
13331
0
                             "not %.100s",
13332
0
                             Py_TYPE(substring)->tp_name);
13333
0
                return NULL;
13334
0
            }
13335
4.39M
            int result = tailmatch(self, substring, start, end, -1);
13336
4.39M
            if (result < 0) {
13337
0
                return NULL;
13338
0
            }
13339
4.39M
            if (result) {
13340
18.6k
                Py_RETURN_TRUE;
13341
18.6k
            }
13342
4.39M
        }
13343
        /* nothing matched */
13344
1.65M
        Py_RETURN_FALSE;
13345
1.65M
    }
13346
41.7M
    if (!PyUnicode_Check(subobj)) {
13347
0
        PyErr_Format(PyExc_TypeError,
13348
0
                     "startswith first arg must be str or "
13349
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13350
0
        return NULL;
13351
0
    }
13352
41.7M
    int result = tailmatch(self, subobj, start, end, -1);
13353
41.7M
    if (result < 0) {
13354
0
        return NULL;
13355
0
    }
13356
41.7M
    return PyBool_FromLong(result);
13357
41.7M
}
13358
13359
13360
/*[clinic input]
13361
@permit_long_summary
13362
@text_signature "($self, suffix[, start[, end]], /)"
13363
str.endswith as unicode_endswith
13364
13365
    suffix as subobj: object
13366
        A string or a tuple of strings to try.
13367
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13368
        Optional start position. Default: start of the string.
13369
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13370
        Optional stop position. Default: end of the string.
13371
    /
13372
13373
Return True if the string ends with the specified suffix, False otherwise.
13374
[clinic start generated code]*/
13375
13376
static PyObject *
13377
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13378
                      Py_ssize_t end)
13379
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13380
10.8M
{
13381
10.8M
    if (PyTuple_Check(subobj)) {
13382
190k
        Py_ssize_t i;
13383
355k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13384
328k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13385
328k
            if (!PyUnicode_Check(substring)) {
13386
0
                PyErr_Format(PyExc_TypeError,
13387
0
                             "tuple for endswith must only contain str, "
13388
0
                             "not %.100s",
13389
0
                             Py_TYPE(substring)->tp_name);
13390
0
                return NULL;
13391
0
            }
13392
328k
            int result = tailmatch(self, substring, start, end, +1);
13393
328k
            if (result < 0) {
13394
0
                return NULL;
13395
0
            }
13396
328k
            if (result) {
13397
162k
                Py_RETURN_TRUE;
13398
162k
            }
13399
328k
        }
13400
190k
        Py_RETURN_FALSE;
13401
190k
    }
13402
10.6M
    if (!PyUnicode_Check(subobj)) {
13403
0
        PyErr_Format(PyExc_TypeError,
13404
0
                     "endswith first arg must be str or "
13405
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13406
0
        return NULL;
13407
0
    }
13408
10.6M
    int result = tailmatch(self, subobj, start, end, +1);
13409
10.6M
    if (result < 0) {
13410
0
        return NULL;
13411
0
    }
13412
10.6M
    return PyBool_FromLong(result);
13413
10.6M
}
13414
13415
13416
#include "stringlib/unicode_format.h"
13417
13418
PyDoc_STRVAR(format__doc__,
13419
             "format($self, /, *args, **kwargs)\n\
13420
--\n\
13421
\n\
13422
Return a formatted version of the string, using substitutions from args and kwargs.\n\
13423
The substitutions are identified by braces ('{' and '}').");
13424
13425
PyDoc_STRVAR(format_map__doc__,
13426
             "format_map($self, mapping, /)\n\
13427
--\n\
13428
\n\
13429
Return a formatted version of the string, using substitutions from mapping.\n\
13430
The substitutions are identified by braces ('{' and '}').");
13431
13432
/*[clinic input]
13433
@permit_long_summary
13434
str.__format__ as unicode___format__
13435
13436
    format_spec: unicode
13437
    /
13438
13439
Return a formatted version of the string as described by format_spec.
13440
[clinic start generated code]*/
13441
13442
static PyObject *
13443
unicode___format___impl(PyObject *self, PyObject *format_spec)
13444
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=77a2a19f3f7969f2]*/
13445
0
{
13446
0
    _PyUnicodeWriter writer;
13447
0
    int ret;
13448
13449
0
    _PyUnicodeWriter_Init(&writer);
13450
0
    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13451
0
                                          self, format_spec, 0,
13452
0
                                          PyUnicode_GET_LENGTH(format_spec));
13453
0
    if (ret == -1) {
13454
0
        _PyUnicodeWriter_Dealloc(&writer);
13455
0
        return NULL;
13456
0
    }
13457
0
    return _PyUnicodeWriter_Finish(&writer);
13458
0
}
13459
13460
/*[clinic input]
13461
str.__sizeof__ as unicode_sizeof
13462
13463
Return the size of the string in memory, in bytes.
13464
[clinic start generated code]*/
13465
13466
static PyObject *
13467
unicode_sizeof_impl(PyObject *self)
13468
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
13469
0
{
13470
0
    Py_ssize_t size;
13471
13472
    /* If it's a compact object, account for base structure +
13473
       character data. */
13474
0
    if (PyUnicode_IS_COMPACT_ASCII(self)) {
13475
0
        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13476
0
    }
13477
0
    else if (PyUnicode_IS_COMPACT(self)) {
13478
0
        size = sizeof(PyCompactUnicodeObject) +
13479
0
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
13480
0
    }
13481
0
    else {
13482
        /* If it is a two-block object, account for base object, and
13483
           for character block if present. */
13484
0
        size = sizeof(PyUnicodeObject);
13485
0
        if (_PyUnicode_DATA_ANY(self))
13486
0
            size += (PyUnicode_GET_LENGTH(self) + 1) *
13487
0
                PyUnicode_KIND(self);
13488
0
    }
13489
0
    if (_PyUnicode_HAS_UTF8_MEMORY(self))
13490
0
        size += PyUnicode_UTF8_LENGTH(self) + 1;
13491
13492
0
    return PyLong_FromSsize_t(size);
13493
0
}
13494
13495
static PyObject *
13496
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
13497
0
{
13498
0
    PyObject *copy = _PyUnicode_Copy(v);
13499
0
    if (!copy)
13500
0
        return NULL;
13501
0
    return Py_BuildValue("(N)", copy);
13502
0
}
13503
13504
/*
13505
This function searchs the longest common leading whitespace
13506
of all lines in the [src, end).
13507
It returns the length of the common leading whitespace and sets `output` to
13508
point to the beginning of the common leading whitespace if length > 0.
13509
*/
13510
static Py_ssize_t
13511
search_longest_common_leading_whitespace(
13512
    const char *const src,
13513
    const char *const end,
13514
    const char **output)
13515
0
{
13516
    // [_start, _start + _len)
13517
    // describes the current longest common leading whitespace
13518
0
    const char *_start = NULL;
13519
0
    Py_ssize_t _len = 0;
13520
13521
0
    for (const char *iter = src; iter < end; ++iter) {
13522
0
        const char *line_start = iter;
13523
0
        const char *leading_whitespace_end = NULL;
13524
13525
        // scan the whole line
13526
0
        while (iter < end && *iter != '\n') {
13527
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
13528
                /* `iter` points to the first non-whitespace character
13529
                   in this line */
13530
0
                if (iter == line_start) {
13531
                    // some line has no indent, fast exit!
13532
0
                    return 0;
13533
0
                }
13534
0
                leading_whitespace_end = iter;
13535
0
            }
13536
0
            ++iter;
13537
0
        }
13538
13539
        // if this line has all white space, skip it
13540
0
        if (!leading_whitespace_end) {
13541
0
            continue;
13542
0
        }
13543
13544
0
        if (!_start) {
13545
            // update the first leading whitespace
13546
0
            _start = line_start;
13547
0
            _len = leading_whitespace_end - line_start;
13548
0
            assert(_len > 0);
13549
0
        }
13550
0
        else {
13551
            /* We then compare with the current longest leading whitespace.
13552
13553
               [line_start, leading_whitespace_end) is the leading
13554
               whitespace of this line,
13555
13556
               [_start, _start + _len) is the leading whitespace of the
13557
               current longest leading whitespace. */
13558
0
            Py_ssize_t new_len = 0;
13559
0
            const char *_iter = _start, *line_iter = line_start;
13560
13561
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
13562
0
                   && *_iter == *line_iter)
13563
0
            {
13564
0
                ++_iter;
13565
0
                ++line_iter;
13566
0
                ++new_len;
13567
0
            }
13568
13569
0
            _len = new_len;
13570
0
            if (_len == 0) {
13571
                // No common things now, fast exit!
13572
0
                return 0;
13573
0
            }
13574
0
        }
13575
0
    }
13576
13577
0
    assert(_len >= 0);
13578
0
    if (_len > 0) {
13579
0
        *output = _start;
13580
0
    }
13581
0
    return _len;
13582
0
}
13583
13584
/* Dedent a string.
13585
   Intended to dedent Python source. Unlike `textwrap.dedent`, this
13586
   only supports spaces and tabs and doesn't normalize empty lines.
13587
   Return a new reference on success, NULL with exception set on error.
13588
   */
13589
PyObject *
13590
_PyUnicode_Dedent(PyObject *unicode)
13591
0
{
13592
0
    Py_ssize_t src_len = 0;
13593
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
13594
0
    if (!src) {
13595
0
        return NULL;
13596
0
    }
13597
0
    assert(src_len >= 0);
13598
0
    if (src_len == 0) {
13599
0
        return Py_NewRef(unicode);
13600
0
    }
13601
13602
0
    const char *const end = src + src_len;
13603
13604
    // [whitespace_start, whitespace_start + whitespace_len)
13605
    // describes the current longest common leading whitespace
13606
0
    const char *whitespace_start = NULL;
13607
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
13608
0
        src, end, &whitespace_start);
13609
13610
0
    if (whitespace_len == 0) {
13611
0
        return Py_NewRef(unicode);
13612
0
    }
13613
13614
    // now we should trigger a dedent
13615
0
    char *dest = PyMem_Malloc(src_len);
13616
0
    if (!dest) {
13617
0
        PyErr_NoMemory();
13618
0
        return NULL;
13619
0
    }
13620
0
    char *dest_iter = dest;
13621
13622
0
    for (const char *iter = src; iter < end; ++iter) {
13623
0
        const char *line_start = iter;
13624
0
        bool in_leading_space = true;
13625
13626
        // iterate over a line to find the end of a line
13627
0
        while (iter < end && *iter != '\n') {
13628
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
13629
0
                in_leading_space = false;
13630
0
            }
13631
0
            ++iter;
13632
0
        }
13633
13634
        // invariant: *iter == '\n' or iter == end
13635
0
        bool append_newline = iter < end;
13636
13637
        // if this line has all white space, write '\n' and continue
13638
0
        if (in_leading_space && append_newline) {
13639
0
            *dest_iter++ = '\n';
13640
0
            continue;
13641
0
        }
13642
13643
        /* copy [new_line_start + whitespace_len, iter) to buffer, then
13644
            conditionally append '\n' */
13645
13646
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
13647
0
        assert(new_line_len >= 0);
13648
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
13649
13650
0
        dest_iter += new_line_len;
13651
13652
0
        if (append_newline) {
13653
0
            *dest_iter++ = '\n';
13654
0
        }
13655
0
    }
13656
13657
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
13658
0
    PyMem_Free(dest);
13659
0
    return res;
13660
0
}
13661
13662
static PyMethodDef unicode_methods[] = {
13663
    UNICODE_ENCODE_METHODDEF
13664
    UNICODE_REPLACE_METHODDEF
13665
    UNICODE_SPLIT_METHODDEF
13666
    UNICODE_RSPLIT_METHODDEF
13667
    UNICODE_JOIN_METHODDEF
13668
    UNICODE_CAPITALIZE_METHODDEF
13669
    UNICODE_CASEFOLD_METHODDEF
13670
    UNICODE_TITLE_METHODDEF
13671
    UNICODE_CENTER_METHODDEF
13672
    UNICODE_COUNT_METHODDEF
13673
    UNICODE_EXPANDTABS_METHODDEF
13674
    UNICODE_FIND_METHODDEF
13675
    UNICODE_PARTITION_METHODDEF
13676
    UNICODE_INDEX_METHODDEF
13677
    UNICODE_LJUST_METHODDEF
13678
    UNICODE_LOWER_METHODDEF
13679
    UNICODE_LSTRIP_METHODDEF
13680
    UNICODE_RFIND_METHODDEF
13681
    UNICODE_RINDEX_METHODDEF
13682
    UNICODE_RJUST_METHODDEF
13683
    UNICODE_RSTRIP_METHODDEF
13684
    UNICODE_RPARTITION_METHODDEF
13685
    UNICODE_SPLITLINES_METHODDEF
13686
    UNICODE_STRIP_METHODDEF
13687
    UNICODE_SWAPCASE_METHODDEF
13688
    UNICODE_TRANSLATE_METHODDEF
13689
    UNICODE_UPPER_METHODDEF
13690
    UNICODE_STARTSWITH_METHODDEF
13691
    UNICODE_ENDSWITH_METHODDEF
13692
    UNICODE_REMOVEPREFIX_METHODDEF
13693
    UNICODE_REMOVESUFFIX_METHODDEF
13694
    UNICODE_ISASCII_METHODDEF
13695
    UNICODE_ISLOWER_METHODDEF
13696
    UNICODE_ISUPPER_METHODDEF
13697
    UNICODE_ISTITLE_METHODDEF
13698
    UNICODE_ISSPACE_METHODDEF
13699
    UNICODE_ISDECIMAL_METHODDEF
13700
    UNICODE_ISDIGIT_METHODDEF
13701
    UNICODE_ISNUMERIC_METHODDEF
13702
    UNICODE_ISALPHA_METHODDEF
13703
    UNICODE_ISALNUM_METHODDEF
13704
    UNICODE_ISIDENTIFIER_METHODDEF
13705
    UNICODE_ISPRINTABLE_METHODDEF
13706
    UNICODE_ZFILL_METHODDEF
13707
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
13708
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
13709
    UNICODE___FORMAT___METHODDEF
13710
    UNICODE_MAKETRANS_METHODDEF
13711
    UNICODE_SIZEOF_METHODDEF
13712
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13713
    {NULL, NULL}
13714
};
13715
13716
static PyObject *
13717
unicode_mod(PyObject *v, PyObject *w)
13718
12.6M
{
13719
12.6M
    if (!PyUnicode_Check(v))
13720
0
        Py_RETURN_NOTIMPLEMENTED;
13721
12.6M
    return PyUnicode_Format(v, w);
13722
12.6M
}
13723
13724
static PyNumberMethods unicode_as_number = {
13725
    0,              /*nb_add*/
13726
    0,              /*nb_subtract*/
13727
    0,              /*nb_multiply*/
13728
    unicode_mod,            /*nb_remainder*/
13729
};
13730
13731
static PySequenceMethods unicode_as_sequence = {
13732
    unicode_length,     /* sq_length */
13733
    PyUnicode_Concat,   /* sq_concat */
13734
    _PyUnicode_Repeat,  /* sq_repeat */
13735
    unicode_getitem,    /* sq_item */
13736
    0,                  /* sq_slice */
13737
    0,                  /* sq_ass_item */
13738
    0,                  /* sq_ass_slice */
13739
    PyUnicode_Contains, /* sq_contains */
13740
};
13741
13742
static PyObject*
13743
unicode_subscript(PyObject* self, PyObject* item)
13744
81.3M
{
13745
81.3M
    if (_PyIndex_Check(item)) {
13746
61.6M
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13747
61.6M
        if (i == -1 && PyErr_Occurred())
13748
0
            return NULL;
13749
61.6M
        if (i < 0)
13750
70.4k
            i += PyUnicode_GET_LENGTH(self);
13751
61.6M
        return unicode_getitem(self, i);
13752
61.6M
    } else if (PySlice_Check(item)) {
13753
19.7M
        Py_ssize_t start, stop, step, slicelength, i;
13754
19.7M
        size_t cur;
13755
19.7M
        PyObject *result;
13756
19.7M
        const void *src_data;
13757
19.7M
        void *dest_data;
13758
19.7M
        int src_kind, dest_kind;
13759
19.7M
        Py_UCS4 ch, max_char, kind_limit;
13760
13761
19.7M
        if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
13762
0
            return NULL;
13763
0
        }
13764
19.7M
        slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13765
19.7M
                                            &start, &stop, step);
13766
13767
19.7M
        if (slicelength <= 0) {
13768
1.21M
            _Py_RETURN_UNICODE_EMPTY();
13769
18.5M
        } else if (start == 0 && step == 1 &&
13770
6.78M
                   slicelength == PyUnicode_GET_LENGTH(self)) {
13771
5.04M
            return unicode_result_unchanged(self);
13772
13.4M
        } else if (step == 1) {
13773
13.4M
            return PyUnicode_Substring(self,
13774
13.4M
                                       start, start + slicelength);
13775
13.4M
        }
13776
        /* General case */
13777
0
        src_kind = PyUnicode_KIND(self);
13778
0
        src_data = PyUnicode_DATA(self);
13779
0
        if (!PyUnicode_IS_ASCII(self)) {
13780
0
            kind_limit = kind_maxchar_limit(src_kind);
13781
0
            max_char = 0;
13782
0
            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13783
0
                ch = PyUnicode_READ(src_kind, src_data, cur);
13784
0
                if (ch > max_char) {
13785
0
                    max_char = ch;
13786
0
                    if (max_char >= kind_limit)
13787
0
                        break;
13788
0
                }
13789
0
            }
13790
0
        }
13791
0
        else
13792
0
            max_char = 127;
13793
0
        result = PyUnicode_New(slicelength, max_char);
13794
0
        if (result == NULL)
13795
0
            return NULL;
13796
0
        dest_kind = PyUnicode_KIND(result);
13797
0
        dest_data = PyUnicode_DATA(result);
13798
13799
0
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13800
0
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13801
0
            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13802
0
        }
13803
0
        assert(_PyUnicode_CheckConsistency(result, 1));
13804
0
        return result;
13805
0
    } else {
13806
0
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
13807
0
                     Py_TYPE(item)->tp_name);
13808
0
        return NULL;
13809
0
    }
13810
81.3M
}
13811
13812
static PyMappingMethods unicode_as_mapping = {
13813
    unicode_length,     /* mp_length */
13814
    unicode_subscript,  /* mp_subscript */
13815
    0,                  /* mp_ass_subscript */
13816
};
13817
13818
13819
static PyObject *
13820
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
13821
13822
/*[clinic input]
13823
@classmethod
13824
str.__new__ as unicode_new
13825
13826
    object as x: object = NULL
13827
    encoding: str = NULL
13828
    errors: str = NULL
13829
13830
[clinic start generated code]*/
13831
13832
static PyObject *
13833
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
13834
                 const char *errors)
13835
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
13836
16.6M
{
13837
16.6M
    PyObject *unicode;
13838
16.6M
    if (x == NULL) {
13839
0
        unicode = _PyUnicode_GetEmpty();
13840
0
    }
13841
16.6M
    else if (encoding == NULL && errors == NULL) {
13842
16.6M
        unicode = PyObject_Str(x);
13843
16.6M
    }
13844
0
    else {
13845
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
13846
0
    }
13847
13848
16.6M
    if (unicode != NULL && type != &PyUnicode_Type) {
13849
16.6M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
13850
16.6M
    }
13851
16.6M
    return unicode;
13852
16.6M
}
13853
13854
static const char *
13855
arg_as_utf8(PyObject *obj, const char *name)
13856
2.86M
{
13857
2.86M
    if (!PyUnicode_Check(obj)) {
13858
0
        PyErr_Format(PyExc_TypeError,
13859
0
                     "str() argument '%s' must be str, not %T",
13860
0
                     name, obj);
13861
0
        return NULL;
13862
0
    }
13863
2.86M
    return _PyUnicode_AsUTF8NoNUL(obj);
13864
2.86M
}
13865
13866
static PyObject *
13867
unicode_vectorcall(PyObject *type, PyObject *const *args,
13868
                   size_t nargsf, PyObject *kwnames)
13869
2.53M
{
13870
2.53M
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
13871
13872
2.53M
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
13873
2.53M
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
13874
        // Fallback to unicode_new()
13875
0
        PyObject *tuple = PyTuple_FromArray(args, nargs);
13876
0
        if (tuple == NULL) {
13877
0
            return NULL;
13878
0
        }
13879
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
13880
0
        if (dict == NULL) {
13881
0
            Py_DECREF(tuple);
13882
0
            return NULL;
13883
0
        }
13884
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
13885
0
        Py_DECREF(tuple);
13886
0
        Py_DECREF(dict);
13887
0
        return ret;
13888
0
    }
13889
2.53M
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
13890
0
        return NULL;
13891
0
    }
13892
2.53M
    if (nargs == 0) {
13893
4.47k
        return _PyUnicode_GetEmpty();
13894
4.47k
    }
13895
2.52M
    PyObject *object = args[0];
13896
2.52M
    if (nargs == 1) {
13897
1.17k
        return PyObject_Str(object);
13898
1.17k
    }
13899
2.52M
    const char *encoding = arg_as_utf8(args[1], "encoding");
13900
2.52M
    if (encoding == NULL) {
13901
0
        return NULL;
13902
0
    }
13903
2.52M
    const char *errors = NULL;
13904
2.52M
    if (nargs == 3) {
13905
338k
        errors = arg_as_utf8(args[2], "errors");
13906
338k
        if (errors == NULL) {
13907
0
            return NULL;
13908
0
        }
13909
338k
    }
13910
2.52M
    return PyUnicode_FromEncodedObject(object, encoding, errors);
13911
2.52M
}
13912
13913
static PyObject *
13914
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
13915
16.6M
{
13916
16.6M
    PyObject *self;
13917
16.6M
    Py_ssize_t length, char_size;
13918
16.6M
    int share_utf8;
13919
16.6M
    int kind;
13920
16.6M
    void *data;
13921
13922
16.6M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13923
16.6M
    assert(_PyUnicode_CHECK(unicode));
13924
13925
16.6M
    self = type->tp_alloc(type, 0);
13926
16.6M
    if (self == NULL) {
13927
0
        return NULL;
13928
0
    }
13929
16.6M
    kind = PyUnicode_KIND(unicode);
13930
16.6M
    length = PyUnicode_GET_LENGTH(unicode);
13931
13932
16.6M
    _PyUnicode_LENGTH(self) = length;
13933
#ifdef Py_DEBUG
13934
    _PyUnicode_HASH(self) = -1;
13935
#else
13936
16.6M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13937
16.6M
#endif
13938
16.6M
    _PyUnicode_STATE(self).interned = 0;
13939
16.6M
    _PyUnicode_STATE(self).kind = kind;
13940
16.6M
    _PyUnicode_STATE(self).compact = 0;
13941
16.6M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13942
16.6M
    _PyUnicode_STATE(self).statically_allocated = 0;
13943
16.6M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
13944
16.6M
    PyUnicode_SET_UTF8(self, NULL);
13945
16.6M
    _PyUnicode_DATA_ANY(self) = NULL;
13946
13947
16.6M
    share_utf8 = 0;
13948
16.6M
    if (kind == PyUnicode_1BYTE_KIND) {
13949
14.0M
        char_size = 1;
13950
14.0M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13951
14.0M
            share_utf8 = 1;
13952
14.0M
    }
13953
2.59M
    else if (kind == PyUnicode_2BYTE_KIND) {
13954
2.52M
        char_size = 2;
13955
2.52M
    }
13956
67.9k
    else {
13957
67.9k
        assert(kind == PyUnicode_4BYTE_KIND);
13958
67.9k
        char_size = 4;
13959
67.9k
    }
13960
13961
    /* Ensure we won't overflow the length. */
13962
16.6M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13963
0
        PyErr_NoMemory();
13964
0
        goto onError;
13965
0
    }
13966
16.6M
    data = PyMem_Malloc((length + 1) * char_size);
13967
16.6M
    if (data == NULL) {
13968
0
        PyErr_NoMemory();
13969
0
        goto onError;
13970
0
    }
13971
13972
16.6M
    _PyUnicode_DATA_ANY(self) = data;
13973
16.6M
    if (share_utf8) {
13974
14.0M
        PyUnicode_SET_UTF8_LENGTH(self, length);
13975
14.0M
        PyUnicode_SET_UTF8(self, data);
13976
14.0M
    }
13977
13978
16.6M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
13979
16.6M
    assert(_PyUnicode_CheckConsistency(self, 1));
13980
#ifdef Py_DEBUG
13981
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13982
#endif
13983
16.6M
    return self;
13984
13985
0
onError:
13986
0
    Py_DECREF(self);
13987
0
    return NULL;
13988
16.6M
}
13989
13990
static _PyObjectIndexPair
13991
unicode_iteritem(PyObject *obj, Py_ssize_t index)
13992
42.5M
{
13993
42.5M
    if (index >= PyUnicode_GET_LENGTH(obj)) {
13994
3.89M
        return (_PyObjectIndexPair) { .object = NULL, .index = index };
13995
3.89M
    }
13996
38.6M
    const void *data = PyUnicode_DATA(obj);
13997
38.6M
    int kind = PyUnicode_KIND(obj);
13998
38.6M
    Py_UCS4 ch = PyUnicode_READ(kind, data, index);
13999
38.6M
    PyObject *result = unicode_char(ch);
14000
38.6M
    index = (result == NULL) ? -1 : index + 1;
14001
38.6M
    return (_PyObjectIndexPair) { .object = result, .index = index };
14002
42.5M
}
14003
14004
void
14005
_PyUnicode_ExactDealloc(PyObject *op)
14006
77.0M
{
14007
77.0M
    assert(PyUnicode_CheckExact(op));
14008
77.0M
    unicode_dealloc(op);
14009
77.0M
}
14010
14011
PyDoc_STRVAR(unicode_doc,
14012
"str(object='') -> str\n\
14013
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14014
\n\
14015
Create a new string object from the given object. If encoding or\n\
14016
errors is specified, then the object must expose a data buffer\n\
14017
that will be decoded using the given encoding and error handler.\n\
14018
Otherwise, returns the result of object.__str__() (if defined)\n\
14019
or repr(object).\n\
14020
encoding defaults to 'utf-8'.\n\
14021
errors defaults to 'strict'.");
14022
14023
static PyObject *unicode_iter(PyObject *seq);
14024
14025
PyTypeObject PyUnicode_Type = {
14026
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14027
    "str",                        /* tp_name */
14028
    sizeof(PyUnicodeObject),      /* tp_basicsize */
14029
    0,                            /* tp_itemsize */
14030
    /* Slots */
14031
    unicode_dealloc,              /* tp_dealloc */
14032
    0,                            /* tp_vectorcall_offset */
14033
    0,                            /* tp_getattr */
14034
    0,                            /* tp_setattr */
14035
    0,                            /* tp_as_async */
14036
    unicode_repr,                 /* tp_repr */
14037
    &unicode_as_number,           /* tp_as_number */
14038
    &unicode_as_sequence,         /* tp_as_sequence */
14039
    &unicode_as_mapping,          /* tp_as_mapping */
14040
    unicode_hash,                 /* tp_hash*/
14041
    0,                            /* tp_call*/
14042
    unicode_str,                  /* tp_str */
14043
    PyObject_GenericGetAttr,      /* tp_getattro */
14044
    0,                            /* tp_setattro */
14045
    0,                            /* tp_as_buffer */
14046
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14047
        Py_TPFLAGS_UNICODE_SUBCLASS |
14048
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
14049
    unicode_doc,                  /* tp_doc */
14050
    0,                            /* tp_traverse */
14051
    0,                            /* tp_clear */
14052
    PyUnicode_RichCompare,        /* tp_richcompare */
14053
    0,                            /* tp_weaklistoffset */
14054
    unicode_iter,                 /* tp_iter */
14055
    0,                            /* tp_iternext */
14056
    unicode_methods,              /* tp_methods */
14057
    0,                            /* tp_members */
14058
    0,                            /* tp_getset */
14059
    0,                            /* tp_base */
14060
    0,                            /* tp_dict */
14061
    0,                            /* tp_descr_get */
14062
    0,                            /* tp_descr_set */
14063
    0,                            /* tp_dictoffset */
14064
    0,                            /* tp_init */
14065
    0,                            /* tp_alloc */
14066
    unicode_new,                  /* tp_new */
14067
    PyObject_Free,                /* tp_free */
14068
    .tp_vectorcall = unicode_vectorcall,
14069
    ._tp_iteritem = unicode_iteritem,
14070
};
14071
14072
/* Initialize the Unicode implementation */
14073
14074
static void
14075
_init_global_state(void)
14076
37
{
14077
37
    static int initialized = 0;
14078
37
    if (initialized) {
14079
0
        return;
14080
0
    }
14081
37
    initialized = 1;
14082
14083
    /* initialize the linebreak bloom filter */
14084
37
    const Py_UCS2 linebreak[] = {
14085
37
        0x000A, /* LINE FEED */
14086
37
        0x000D, /* CARRIAGE RETURN */
14087
37
        0x001C, /* FILE SEPARATOR */
14088
37
        0x001D, /* GROUP SEPARATOR */
14089
37
        0x001E, /* RECORD SEPARATOR */
14090
37
        0x0085, /* NEXT LINE */
14091
37
        0x2028, /* LINE SEPARATOR */
14092
37
        0x2029, /* PARAGRAPH SEPARATOR */
14093
37
    };
14094
37
    bloom_linebreak = make_bloom_mask(
14095
37
        PyUnicode_2BYTE_KIND, linebreak,
14096
37
        Py_ARRAY_LENGTH(linebreak));
14097
37
}
14098
14099
void
14100
_PyUnicode_InitState(PyInterpreterState *interp)
14101
37
{
14102
37
    if (!_Py_IsMainInterpreter(interp)) {
14103
0
        return;
14104
0
    }
14105
37
    _init_global_state();
14106
37
}
14107
14108
14109
PyStatus
14110
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
14111
37
{
14112
37
    if (_Py_IsMainInterpreter(interp)) {
14113
37
        PyStatus status = init_global_interned_strings(interp);
14114
37
        if (_PyStatus_EXCEPTION(status)) {
14115
0
            return status;
14116
0
        }
14117
37
    }
14118
37
    assert(INTERNED_STRINGS);
14119
14120
37
    if (init_interned_dict(interp)) {
14121
0
        PyErr_Clear();
14122
0
        return _PyStatus_ERR("failed to create interned dict");
14123
0
    }
14124
14125
37
    return _PyStatus_OK();
14126
37
}
14127
14128
14129
PyStatus
14130
_PyUnicode_InitTypes(PyInterpreterState *interp)
14131
37
{
14132
37
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
14133
0
        goto error;
14134
0
    }
14135
37
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
14136
0
        goto error;
14137
0
    }
14138
37
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
14139
0
        goto error;
14140
0
    }
14141
37
    return _PyStatus_OK();
14142
14143
0
error:
14144
0
    return _PyStatus_ERR("Can't initialize unicode types");
14145
37
}
14146
14147
static /* non-null */ PyObject*
14148
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
14149
41.4k
{
14150
    // Note that this steals a reference to `s`, but in many cases that
14151
    // stolen ref is returned, requiring no decref/incref.
14152
14153
41.4k
    assert(s != NULL);
14154
41.4k
    assert(_PyUnicode_CHECK(s));
14155
41.4k
    assert(_PyUnicode_STATE(s).statically_allocated);
14156
41.4k
    assert(!PyUnicode_CHECK_INTERNED(s));
14157
14158
#ifdef Py_DEBUG
14159
    /* We must not add process-global interned string if there's already a
14160
     * per-interpreter interned_dict, which might contain duplicates.
14161
     */
14162
    PyObject *interned = get_interned_dict(interp);
14163
    assert(interned == NULL);
14164
#endif
14165
14166
    /* Look in the global cache first. */
14167
41.4k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14168
    /* We should only init each string once */
14169
41.4k
    assert(r == NULL);
14170
    /* but just in case (for the non-debug build), handle this */
14171
41.4k
    if (r != NULL && r != s) {
14172
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
14173
0
        assert(_PyUnicode_CHECK(r));
14174
0
        Py_DECREF(s);
14175
0
        return Py_NewRef(r);
14176
0
    }
14177
14178
41.4k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
14179
0
        Py_FatalError("failed to intern static string");
14180
0
    }
14181
14182
41.4k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
14183
41.4k
    return s;
14184
41.4k
}
14185
14186
void
14187
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
14188
41.4k
{
14189
    // This should only be called as part of runtime initialization
14190
41.4k
    assert(!Py_IsInitialized());
14191
14192
41.4k
    *p = intern_static(interp, *p);
14193
41.4k
    assert(*p);
14194
41.4k
}
14195
14196
static void
14197
immortalize_interned(PyObject *s)
14198
290k
{
14199
290k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
14200
290k
    assert(!_Py_IsImmortal(s));
14201
#ifdef Py_REF_DEBUG
14202
    /* The reference count value should be excluded from the RefTotal.
14203
       The decrements to these objects will not be registered so they
14204
       need to be accounted for in here. */
14205
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
14206
        _Py_DecRefTotal(_PyThreadState_GET());
14207
    }
14208
#endif
14209
290k
    _Py_SetImmortal(s);
14210
    // The switch to SSTATE_INTERNED_IMMORTAL must be the last thing done here
14211
    // to synchronize with the check in intern_common() that avoids locking if
14212
    // the string is already immortal.
14213
290k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
14214
290k
}
14215
14216
#ifdef Py_GIL_DISABLED
14217
static bool
14218
can_immortalize_safely(PyObject *s)
14219
{
14220
    if (_Py_IsOwnedByCurrentThread(s) || _Py_IsImmortal(s)) {
14221
        return true;
14222
    }
14223
    Py_ssize_t shared = _Py_atomic_load_ssize(&s->ob_ref_shared);
14224
    return _Py_REF_IS_MERGED(shared);
14225
}
14226
#endif
14227
14228
static /* non-null */ PyObject*
14229
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
14230
              bool immortalize)
14231
95.7M
{
14232
    // Note that this steals a reference to `s`, but in many cases that
14233
    // stolen ref is returned, requiring no decref/incref.
14234
14235
#ifdef Py_DEBUG
14236
    assert(s != NULL);
14237
    assert(_PyUnicode_CHECK(s));
14238
#else
14239
95.7M
    if (s == NULL || !PyUnicode_Check(s)) {
14240
0
        return s;
14241
0
    }
14242
95.7M
#endif
14243
14244
    /* If it's a subclass, we don't really know what putting
14245
       it in the interned dict might do. */
14246
95.7M
    if (!PyUnicode_CheckExact(s)) {
14247
0
        return s;
14248
0
    }
14249
14250
    /* Is it already interned? */
14251
95.7M
    switch (PyUnicode_CHECK_INTERNED(s)) {
14252
6.08M
        case SSTATE_NOT_INTERNED:
14253
            // no, go on
14254
6.08M
            break;
14255
29.1k
        case SSTATE_INTERNED_MORTAL:
14256
29.1k
#ifndef Py_GIL_DISABLED
14257
            // yes but we might need to make it immortal
14258
29.1k
            if (immortalize) {
14259
1.78k
                immortalize_interned(s);
14260
1.78k
            }
14261
29.1k
            return s;
14262
#else
14263
            // not fully interned yet; fall through to the locking path
14264
            break;
14265
#endif
14266
89.6M
        default:
14267
            // all done
14268
89.6M
            return s;
14269
95.7M
    }
14270
14271
    /* Statically allocated strings must be already interned. */
14272
95.7M
    assert(!_PyUnicode_STATE(s).statically_allocated);
14273
14274
#if Py_GIL_DISABLED
14275
    /* In the free-threaded build, all interned strings are immortal */
14276
    immortalize = 1;
14277
#endif
14278
14279
    /* If it's already immortal, intern it as such */
14280
6.08M
    if (_Py_IsImmortal(s)) {
14281
0
        immortalize = 1;
14282
0
    }
14283
14284
    /* if it's a short string, get the singleton */
14285
6.08M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
14286
17.2k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
14287
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
14288
0
        assert(PyUnicode_CHECK_INTERNED(r));
14289
0
        Py_DECREF(s);
14290
0
        return r;
14291
0
    }
14292
#ifdef Py_DEBUG
14293
    assert(!unicode_is_singleton(s));
14294
#endif
14295
14296
    /* Look in the global cache now. */
14297
6.08M
    {
14298
6.08M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
14299
6.08M
        if (r != NULL) {
14300
542k
            assert(_PyUnicode_STATE(r).statically_allocated);
14301
542k
            assert(r != s);  // r must be statically_allocated; s is not
14302
542k
            Py_DECREF(s);
14303
542k
            return Py_NewRef(r);
14304
542k
        }
14305
6.08M
    }
14306
14307
    /* Do a setdefault on the per-interpreter cache. */
14308
5.54M
    PyObject *interned = get_interned_dict(interp);
14309
5.54M
    assert(interned != NULL);
14310
#ifdef Py_GIL_DISABLED
14311
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
14312
    // Lock-free fast path: check if there's already an interned copy that
14313
    // is in its final immortal state.
14314
    PyObject *r;
14315
    int res = PyDict_GetItemRef(interned, s, &r);
14316
    if (res < 0) {
14317
        PyErr_Clear();
14318
        return s;
14319
    }
14320
    if (res > 0) {
14321
        unsigned int state = _Py_atomic_load_uint8(&_PyUnicode_STATE(r).interned);
14322
        if (state == SSTATE_INTERNED_IMMORTAL) {
14323
            Py_DECREF(s);
14324
            return r;
14325
        }
14326
        // Not yet fully interned; fall through to the locking path.
14327
        Py_DECREF(r);
14328
    }
14329
#endif
14330
14331
#ifdef Py_GIL_DISABLED
14332
    // Immortalization writes to the refcount fields non-atomically. That
14333
    // races with Py_INCREF / Py_DECREF on the thread that owns `s`. If we
14334
    // don't own it (and its refcount hasn't been merged), intern a copy
14335
    // we own instead.
14336
    if (!can_immortalize_safely(s)) {
14337
        PyObject *copy = _PyUnicode_Copy(s);
14338
        if (copy == NULL) {
14339
            PyErr_Clear();
14340
            return s;
14341
        }
14342
        Py_DECREF(s);
14343
        s = copy;
14344
    }
14345
#endif
14346
14347
5.54M
    FT_MUTEX_LOCK(INTERN_MUTEX);
14348
5.54M
    PyObject *t;
14349
5.54M
    {
14350
5.54M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
14351
5.54M
        if (res < 0) {
14352
0
            PyErr_Clear();
14353
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14354
0
            return s;
14355
0
        }
14356
5.54M
        else if (res == 1) {
14357
            // value was already present (not inserted)
14358
4.69M
            Py_DECREF(s);
14359
4.69M
            if (immortalize &&
14360
1.13M
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
14361
10.2k
                immortalize_interned(t);
14362
10.2k
            }
14363
4.69M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
14364
4.69M
            return t;
14365
4.69M
        }
14366
850k
        else {
14367
            // value was newly inserted
14368
850k
            assert (s == t);
14369
850k
            Py_DECREF(t);
14370
850k
        }
14371
5.54M
    }
14372
14373
    /* NOT_INTERNED -> INTERNED_MORTAL */
14374
14375
5.54M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
14376
14377
850k
    if (!_Py_IsImmortal(s)) {
14378
        /* The two references in interned dict (key and value) are not counted.
14379
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
14380
850k
        Py_DECREF(s);
14381
850k
        Py_DECREF(s);
14382
850k
    }
14383
850k
    FT_ATOMIC_STORE_UINT8(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
14384
14385
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
14386
14387
#ifdef Py_DEBUG
14388
    if (_Py_IsImmortal(s)) {
14389
        assert(immortalize);
14390
    }
14391
#endif
14392
850k
    if (immortalize) {
14393
278k
        immortalize_interned(s);
14394
278k
    }
14395
14396
850k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
14397
850k
    return s;
14398
5.54M
}
14399
14400
void
14401
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
14402
14.9M
{
14403
14.9M
    *p = intern_common(interp, *p, 1);
14404
14.9M
    assert(*p);
14405
14.9M
}
14406
14407
void
14408
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
14409
80.8M
{
14410
80.8M
    *p = intern_common(interp, *p, 0);
14411
80.8M
    assert(*p);
14412
80.8M
}
14413
14414
14415
void
14416
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
14417
0
{
14418
0
    _PyUnicode_InternImmortal(interp, p);
14419
0
    return;
14420
0
}
14421
14422
void
14423
PyUnicode_InternInPlace(PyObject **p)
14424
0
{
14425
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14426
0
    _PyUnicode_InternMortal(interp, p);
14427
0
}
14428
14429
// Public-looking name kept for the stable ABI; user should not call this:
14430
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
14431
void
14432
PyUnicode_InternImmortal(PyObject **p)
14433
0
{
14434
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
14435
0
    _PyUnicode_InternImmortal(interp, p);
14436
0
}
14437
14438
PyObject *
14439
PyUnicode_InternFromString(const char *cp)
14440
1.33M
{
14441
1.33M
    PyObject *s = PyUnicode_FromString(cp);
14442
1.33M
    if (s == NULL) {
14443
0
        return NULL;
14444
0
    }
14445
1.33M
    PyInterpreterState *interp = _PyInterpreterState_GET();
14446
1.33M
    _PyUnicode_InternMortal(interp, &s);
14447
1.33M
    return s;
14448
1.33M
}
14449
14450
14451
void
14452
_PyUnicode_ClearInterned(PyInterpreterState *interp)
14453
0
{
14454
0
    PyObject *interned = get_interned_dict(interp);
14455
0
    if (interned == NULL) {
14456
0
        return;
14457
0
    }
14458
0
    assert(PyDict_CheckExact(interned));
14459
14460
0
    if (has_shared_intern_dict(interp)) {
14461
        // the dict doesn't belong to this interpreter, skip the debug
14462
        // checks on it and just clear the pointer to it
14463
0
        clear_interned_dict(interp);
14464
0
        return;
14465
0
    }
14466
14467
#ifdef INTERNED_STATS
14468
    fprintf(stderr, "releasing %zd interned strings\n",
14469
            PyDict_GET_SIZE(interned));
14470
14471
    Py_ssize_t total_length = 0;
14472
#endif
14473
0
    Py_ssize_t pos = 0;
14474
0
    PyObject *s, *ignored_value;
14475
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
14476
0
        int shared = 0;
14477
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
14478
0
        case SSTATE_INTERNED_IMMORTAL:
14479
            /* Make immortal interned strings mortal again. */
14480
            // Skip the Immortal Instance check and restore
14481
            // the two references (key and value) ignored
14482
            // by PyUnicode_InternInPlace().
14483
0
            _Py_SetMortal(s, 2);
14484
#ifdef Py_REF_DEBUG
14485
            /* let's be pedantic with the ref total */
14486
            _Py_IncRefTotal(_PyThreadState_GET());
14487
            _Py_IncRefTotal(_PyThreadState_GET());
14488
#endif
14489
#ifdef INTERNED_STATS
14490
            total_length += PyUnicode_GET_LENGTH(s);
14491
#endif
14492
0
            break;
14493
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
14494
            /* It is shared between interpreters, so we should unmark it
14495
               only when this is the last interpreter in which it's
14496
               interned.  We immortalize all the statically initialized
14497
               strings during startup, so we can rely on the
14498
               main interpreter to be the last one. */
14499
0
            if (!_Py_IsMainInterpreter(interp)) {
14500
0
                shared = 1;
14501
0
            }
14502
0
            break;
14503
0
        case SSTATE_INTERNED_MORTAL:
14504
            // Restore 2 references held by the interned dict; these will
14505
            // be decref'd by clear_interned_dict's PyDict_Clear.
14506
0
            _Py_RefcntAdd(s, 2);
14507
#ifdef Py_REF_DEBUG
14508
            /* let's be pedantic with the ref total */
14509
            _Py_IncRefTotal(_PyThreadState_GET());
14510
            _Py_IncRefTotal(_PyThreadState_GET());
14511
#endif
14512
0
            break;
14513
0
        case SSTATE_NOT_INTERNED:
14514
0
            _Py_FALLTHROUGH;
14515
0
        default:
14516
0
            Py_UNREACHABLE();
14517
0
        }
14518
0
        if (!shared) {
14519
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
14520
0
        }
14521
0
    }
14522
#ifdef INTERNED_STATS
14523
    fprintf(stderr,
14524
            "total length of all interned strings: %zd characters\n",
14525
            total_length);
14526
#endif
14527
14528
0
    struct _Py_unicode_state *state = &interp->unicode;
14529
0
    struct _Py_unicode_ids *ids = &state->ids;
14530
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
14531
0
        Py_XINCREF(ids->array[i]);
14532
0
    }
14533
0
    clear_interned_dict(interp);
14534
0
    if (_Py_IsMainInterpreter(interp)) {
14535
0
        clear_global_interned_strings();
14536
0
    }
14537
0
}
14538
14539
14540
/********************* Unicode Iterator **************************/
14541
14542
typedef struct {
14543
    PyObject_HEAD
14544
    Py_ssize_t it_index;
14545
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14546
} unicodeiterobject;
14547
14548
static void
14549
unicodeiter_dealloc(PyObject *op)
14550
1.97M
{
14551
1.97M
    unicodeiterobject *it = (unicodeiterobject *)op;
14552
1.97M
    _PyObject_GC_UNTRACK(it);
14553
1.97M
    Py_XDECREF(it->it_seq);
14554
1.97M
    PyObject_GC_Del(it);
14555
1.97M
}
14556
14557
static int
14558
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
14559
10
{
14560
10
    unicodeiterobject *it = (unicodeiterobject *)op;
14561
10
    Py_VISIT(it->it_seq);
14562
10
    return 0;
14563
10
}
14564
14565
static PyObject *
14566
unicodeiter_next(PyObject *op)
14567
59.1M
{
14568
59.1M
    unicodeiterobject *it = (unicodeiterobject *)op;
14569
59.1M
    PyObject *seq;
14570
14571
59.1M
    assert(it != NULL);
14572
59.1M
    seq = it->it_seq;
14573
59.1M
    if (seq == NULL)
14574
0
        return NULL;
14575
59.1M
    assert(_PyUnicode_CHECK(seq));
14576
14577
59.1M
    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14578
57.7M
        int kind = PyUnicode_KIND(seq);
14579
57.7M
        const void *data = PyUnicode_DATA(seq);
14580
57.7M
        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14581
57.7M
        it->it_index++;
14582
57.7M
        return unicode_char(chr);
14583
57.7M
    }
14584
14585
1.39M
    it->it_seq = NULL;
14586
1.39M
    Py_DECREF(seq);
14587
1.39M
    return NULL;
14588
59.1M
}
14589
14590
static PyObject *
14591
unicode_ascii_iter_next(PyObject *op)
14592
5.61M
{
14593
5.61M
    unicodeiterobject *it = (unicodeiterobject *)op;
14594
5.61M
    assert(it != NULL);
14595
5.61M
    PyObject *seq = it->it_seq;
14596
5.61M
    if (seq == NULL) {
14597
0
        return NULL;
14598
0
    }
14599
5.61M
    assert(_PyUnicode_CHECK(seq));
14600
5.61M
    assert(PyUnicode_IS_COMPACT_ASCII(seq));
14601
5.61M
    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14602
5.04M
        const void *data = ((void*)(_PyASCIIObject_CAST(seq) + 1));
14603
5.04M
        Py_UCS1 chr = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
14604
5.04M
                                              data, it->it_index);
14605
5.04M
        it->it_index++;
14606
5.04M
        return (PyObject*)&_Py_SINGLETON(strings).ascii[chr];
14607
5.04M
    }
14608
562k
    it->it_seq = NULL;
14609
562k
    Py_DECREF(seq);
14610
562k
    return NULL;
14611
5.61M
}
14612
14613
static PyObject *
14614
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
14615
1.52M
{
14616
1.52M
    unicodeiterobject *it = (unicodeiterobject *)op;
14617
1.52M
    Py_ssize_t len = 0;
14618
1.52M
    if (it->it_seq)
14619
1.52M
        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14620
1.52M
    return PyLong_FromSsize_t(len);
14621
1.52M
}
14622
14623
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14624
14625
static PyObject *
14626
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
14627
0
{
14628
0
    unicodeiterobject *it = (unicodeiterobject *)op;
14629
0
    PyObject *iter = _PyEval_GetBuiltin(&_Py_ID(iter));
14630
14631
    /* _PyEval_GetBuiltin can invoke arbitrary code,
14632
     * call must be before access of iterator pointers.
14633
     * see issue #101765 */
14634
14635
0
    if (it->it_seq != NULL) {
14636
0
        return Py_BuildValue("N(O)n", iter, it->it_seq, it->it_index);
14637
0
    } else {
14638
0
        PyObject *u = _PyUnicode_GetEmpty();
14639
0
        if (u == NULL) {
14640
0
            Py_XDECREF(iter);
14641
0
            return NULL;
14642
0
        }
14643
0
        return Py_BuildValue("N(N)", iter, u);
14644
0
    }
14645
0
}
14646
14647
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14648
14649
static PyObject *
14650
unicodeiter_setstate(PyObject *op, PyObject *state)
14651
0
{
14652
0
    unicodeiterobject *it = (unicodeiterobject *)op;
14653
0
    Py_ssize_t index = PyLong_AsSsize_t(state);
14654
0
    if (index == -1 && PyErr_Occurred())
14655
0
        return NULL;
14656
0
    if (it->it_seq != NULL) {
14657
0
        if (index < 0)
14658
0
            index = 0;
14659
0
        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
14660
0
            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
14661
0
        it->it_index = index;
14662
0
    }
14663
0
    Py_RETURN_NONE;
14664
0
}
14665
14666
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14667
14668
static PyMethodDef unicodeiter_methods[] = {
14669
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
14670
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
14671
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
14672
    {NULL,      NULL}       /* sentinel */
14673
};
14674
14675
PyTypeObject PyUnicodeIter_Type = {
14676
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14677
    "str_iterator",         /* tp_name */
14678
    sizeof(unicodeiterobject),      /* tp_basicsize */
14679
    0,                  /* tp_itemsize */
14680
    /* methods */
14681
    unicodeiter_dealloc,/* tp_dealloc */
14682
    0,                  /* tp_vectorcall_offset */
14683
    0,                  /* tp_getattr */
14684
    0,                  /* tp_setattr */
14685
    0,                  /* tp_as_async */
14686
    0,                  /* tp_repr */
14687
    0,                  /* tp_as_number */
14688
    0,                  /* tp_as_sequence */
14689
    0,                  /* tp_as_mapping */
14690
    0,                  /* tp_hash */
14691
    0,                  /* tp_call */
14692
    0,                  /* tp_str */
14693
    PyObject_GenericGetAttr,        /* tp_getattro */
14694
    0,                  /* tp_setattro */
14695
    0,                  /* tp_as_buffer */
14696
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14697
    0,                  /* tp_doc */
14698
    unicodeiter_traverse, /* tp_traverse */
14699
    0,                  /* tp_clear */
14700
    0,                  /* tp_richcompare */
14701
    0,                  /* tp_weaklistoffset */
14702
    PyObject_SelfIter,          /* tp_iter */
14703
    unicodeiter_next,   /* tp_iternext */
14704
    unicodeiter_methods,            /* tp_methods */
14705
    0,
14706
};
14707
14708
PyTypeObject _PyUnicodeASCIIIter_Type = {
14709
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14710
    .tp_name = "str_ascii_iterator",
14711
    .tp_basicsize = sizeof(unicodeiterobject),
14712
    .tp_dealloc = unicodeiter_dealloc,
14713
    .tp_getattro = PyObject_GenericGetAttr,
14714
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
14715
    .tp_traverse = unicodeiter_traverse,
14716
    .tp_iter = PyObject_SelfIter,
14717
    .tp_iternext = unicode_ascii_iter_next,
14718
    .tp_methods = unicodeiter_methods,
14719
};
14720
14721
static PyObject *
14722
unicode_iter(PyObject *seq)
14723
1.97M
{
14724
1.97M
    unicodeiterobject *it;
14725
14726
1.97M
    if (!PyUnicode_Check(seq)) {
14727
0
        PyErr_BadInternalCall();
14728
0
        return NULL;
14729
0
    }
14730
1.97M
    if (PyUnicode_IS_COMPACT_ASCII(seq)) {
14731
574k
        it = PyObject_GC_New(unicodeiterobject, &_PyUnicodeASCIIIter_Type);
14732
574k
    }
14733
1.39M
    else {
14734
1.39M
        it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14735
1.39M
    }
14736
1.97M
    if (it == NULL)
14737
0
        return NULL;
14738
1.97M
    it->it_index = 0;
14739
1.97M
    it->it_seq = Py_NewRef(seq);
14740
1.97M
    _PyObject_GC_TRACK(it);
14741
1.97M
    return (PyObject *)it;
14742
1.97M
}
14743
14744
static int
14745
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
14746
148
{
14747
148
    int res;
14748
148
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
14749
148
    if (res == -2) {
14750
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
14751
0
        return -1;
14752
0
    }
14753
148
    if (res < 0) {
14754
0
        PyErr_NoMemory();
14755
0
        return -1;
14756
0
    }
14757
148
    return 0;
14758
148
}
14759
14760
14761
static int
14762
config_get_codec_name(wchar_t **config_encoding)
14763
74
{
14764
74
    char *encoding;
14765
74
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
14766
0
        return -1;
14767
0
    }
14768
14769
74
    PyObject *name_obj = NULL;
14770
74
    PyObject *codec = _PyCodec_Lookup(encoding);
14771
74
    PyMem_RawFree(encoding);
14772
14773
74
    if (!codec)
14774
0
        goto error;
14775
14776
74
    name_obj = PyObject_GetAttrString(codec, "name");
14777
74
    Py_CLEAR(codec);
14778
74
    if (!name_obj) {
14779
0
        goto error;
14780
0
    }
14781
14782
74
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
14783
74
    Py_DECREF(name_obj);
14784
74
    if (wname == NULL) {
14785
0
        goto error;
14786
0
    }
14787
14788
74
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
14789
74
    if (raw_wname == NULL) {
14790
0
        PyMem_Free(wname);
14791
0
        PyErr_NoMemory();
14792
0
        goto error;
14793
0
    }
14794
14795
74
    PyMem_RawFree(*config_encoding);
14796
74
    *config_encoding = raw_wname;
14797
14798
74
    PyMem_Free(wname);
14799
74
    return 0;
14800
14801
0
error:
14802
0
    Py_XDECREF(codec);
14803
0
    Py_XDECREF(name_obj);
14804
0
    return -1;
14805
74
}
14806
14807
14808
static PyStatus
14809
init_stdio_encoding(PyInterpreterState *interp)
14810
37
{
14811
    /* Update the stdio encoding to the normalized Python codec name. */
14812
37
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14813
37
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
14814
0
        return _PyStatus_ERR("failed to get the Python codec name "
14815
0
                             "of the stdio encoding");
14816
0
    }
14817
37
    return _PyStatus_OK();
14818
37
}
14819
14820
14821
static int
14822
init_fs_codec(PyInterpreterState *interp)
14823
37
{
14824
37
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
14825
14826
37
    _Py_error_handler error_handler;
14827
37
    error_handler = get_error_handler_wide(config->filesystem_errors);
14828
37
    if (error_handler == _Py_ERROR_UNKNOWN) {
14829
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
14830
0
        return -1;
14831
0
    }
14832
14833
37
    char *encoding, *errors;
14834
37
    if (encode_wstr_utf8(config->filesystem_encoding,
14835
37
                         &encoding,
14836
37
                         "filesystem_encoding") < 0) {
14837
0
        return -1;
14838
0
    }
14839
14840
37
    if (encode_wstr_utf8(config->filesystem_errors,
14841
37
                         &errors,
14842
37
                         "filesystem_errors") < 0) {
14843
0
        PyMem_RawFree(encoding);
14844
0
        return -1;
14845
0
    }
14846
14847
37
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
14848
37
    PyMem_RawFree(fs_codec->encoding);
14849
37
    fs_codec->encoding = encoding;
14850
    /* encoding has been normalized by init_fs_encoding() */
14851
37
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
14852
37
    PyMem_RawFree(fs_codec->errors);
14853
37
    fs_codec->errors = errors;
14854
37
    fs_codec->error_handler = error_handler;
14855
14856
#ifdef _Py_FORCE_UTF8_FS_ENCODING
14857
    assert(fs_codec->utf8 == 1);
14858
#endif
14859
14860
    /* At this point, PyUnicode_EncodeFSDefault() and
14861
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
14862
       the C implementation of the filesystem encoding. */
14863
14864
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
14865
       global configuration variables. */
14866
37
    if (_Py_IsMainInterpreter(interp)) {
14867
14868
37
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
14869
37
                                      fs_codec->errors) < 0) {
14870
0
            PyErr_NoMemory();
14871
0
            return -1;
14872
0
        }
14873
37
    }
14874
37
    return 0;
14875
37
}
14876
14877
14878
static PyStatus
14879
init_fs_encoding(PyThreadState *tstate)
14880
37
{
14881
37
    PyInterpreterState *interp = tstate->interp;
14882
14883
    /* Update the filesystem encoding to the normalized Python codec name.
14884
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
14885
       (Python codec name). */
14886
37
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
14887
37
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
14888
0
        _Py_DumpPathConfig(tstate);
14889
0
        return _PyStatus_ERR("failed to get the Python codec "
14890
0
                             "of the filesystem encoding");
14891
0
    }
14892
14893
37
    if (init_fs_codec(interp) < 0) {
14894
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
14895
0
    }
14896
37
    return _PyStatus_OK();
14897
37
}
14898
14899
14900
PyStatus
14901
_PyUnicode_InitEncodings(PyThreadState *tstate)
14902
37
{
14903
37
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
14904
37
    if (_PyStatus_EXCEPTION(status)) {
14905
0
        return status;
14906
0
    }
14907
37
    status = init_fs_encoding(tstate);
14908
37
    if (_PyStatus_EXCEPTION(status)) {
14909
0
        return status;
14910
0
    }
14911
14912
37
    return init_stdio_encoding(tstate->interp);
14913
37
}
14914
14915
14916
static void
14917
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
14918
0
{
14919
0
    PyMem_RawFree(fs_codec->encoding);
14920
0
    fs_codec->encoding = NULL;
14921
0
    fs_codec->utf8 = 0;
14922
0
    PyMem_RawFree(fs_codec->errors);
14923
0
    fs_codec->errors = NULL;
14924
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
14925
0
}
14926
14927
14928
#ifdef Py_DEBUG
14929
static inline int
14930
unicode_is_finalizing(void)
14931
{
14932
    return (get_interned_dict(_PyInterpreterState_Main()) == NULL);
14933
}
14934
#endif
14935
14936
14937
void
14938
_PyUnicode_FiniTypes(PyInterpreterState *interp)
14939
0
{
14940
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
14941
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
14942
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
14943
0
}
14944
14945
14946
void
14947
_PyUnicode_Fini(PyInterpreterState *interp)
14948
0
{
14949
0
    struct _Py_unicode_state *state = &interp->unicode;
14950
14951
0
    if (!has_shared_intern_dict(interp)) {
14952
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
14953
0
        assert(get_interned_dict(interp) == NULL);
14954
0
    }
14955
14956
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
14957
14958
    // bpo-47182: force a unicodedata CAPI capsule re-import on
14959
    // subsequent initialization of interpreter.
14960
0
    interp->unicode.ucnhash_capi = NULL;
14961
14962
0
    unicode_clear_identifiers(state);
14963
0
}
14964
14965
/* A _string module, to export formatter_parser and formatter_field_name_split
14966
   to the string.Formatter class implemented in Python. */
14967
14968
static PyMethodDef _string_methods[] = {
14969
    {"formatter_field_name_split", formatter_field_name_split,
14970
     METH_O, PyDoc_STR("split the argument as a field name")},
14971
    {"formatter_parser", formatter_parser,
14972
     METH_O, PyDoc_STR("parse the argument as a format string")},
14973
    {NULL, NULL}
14974
};
14975
14976
static PyModuleDef_Slot module_slots[] = {
14977
    _Py_ABI_SLOT,
14978
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
14979
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
14980
    {0, NULL}
14981
};
14982
14983
static struct PyModuleDef _string_module = {
14984
    PyModuleDef_HEAD_INIT,
14985
    .m_name = "_string",
14986
    .m_doc = PyDoc_STR("string helper module"),
14987
    .m_size = 0,
14988
    .m_methods = _string_methods,
14989
    .m_slots = module_slots,
14990
};
14991
14992
PyMODINIT_FUNC
14993
PyInit__string(void)
14994
8
{
14995
8
    return PyModuleDef_Init(&_string_module);
14996
8
}
14997
14998
14999
#undef PyUnicode_KIND
15000
int PyUnicode_KIND(PyObject *op)
15001
0
{
15002
0
    if (!PyUnicode_Check(op)) {
15003
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15004
0
        return -1;
15005
0
    }
15006
0
    return _PyASCIIObject_CAST(op)->state.kind;
15007
0
}
15008
15009
#undef PyUnicode_DATA
15010
void* PyUnicode_DATA(PyObject *op)
15011
0
{
15012
0
    if (!PyUnicode_Check(op)) {
15013
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
15014
0
        return NULL;
15015
0
    }
15016
0
    return _PyUnicode_DATA(op);
15017
0
}