Coverage Report

Created: 2025-10-10 06:33

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Objects/unicodeobject.c
Line
Count
Source
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#include "Python.h"
42
#include "pycore_abstract.h"      // _PyIndex_Check()
43
#include "pycore_bytes_methods.h" // _Py_bytes_lower()
44
#include "pycore_bytesobject.h"   // _PyBytes_Repeat()
45
#include "pycore_ceval.h"         // _PyEval_GetBuiltin()
46
#include "pycore_codecs.h"        // _PyCodec_Lookup()
47
#include "pycore_critical_section.h" // Py_*_CRITICAL_SECTION_SEQUENCE_FAST
48
#include "pycore_format.h"        // F_LJUST
49
#include "pycore_freelist.h"      // _Py_FREELIST_FREE(), _Py_FREELIST_POP()
50
#include "pycore_initconfig.h"    // _PyStatus_OK()
51
#include "pycore_interp.h"        // PyInterpreterState.fs_codec
52
#include "pycore_long.h"          // _PyLong_FormatWriter()
53
#include "pycore_object.h"        // _PyObject_GC_TRACK(), _Py_FatalRefcountError()
54
#include "pycore_pathconfig.h"    // _Py_DumpPathConfig()
55
#include "pycore_pyerrors.h"      // _PyUnicodeTranslateError_Create()
56
#include "pycore_pyhash.h"        // _Py_HashSecret_t
57
#include "pycore_pylifecycle.h"   // _Py_SetFileSystemEncoding()
58
#include "pycore_pystate.h"       // _PyInterpreterState_GET()
59
#include "pycore_tuple.h"         // _PyTuple_FromArray()
60
#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
61
#include "pycore_unicodeobject.h" // struct _Py_unicode_state
62
#include "pycore_unicodeobject_generated.h"  // _PyUnicode_InitStaticStrings()
63
64
#include "stringlib/eq.h"         // unicode_eq()
65
#include <stddef.h>               // ptrdiff_t
66
67
#ifdef MS_WINDOWS
68
#include <windows.h>
69
#endif
70
71
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
72
#  include "pycore_fileutils.h"   // _Py_LocaleUsesNonUnicodeWchar()
73
#endif
74
75
/* Uncomment to display statistics on interned strings at exit
76
   in _PyUnicode_ClearInterned(). */
77
/* #define INTERNED_STATS 1 */
78
79
80
/*[clinic input]
81
class str "PyObject *" "&PyUnicode_Type"
82
[clinic start generated code]*/
83
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
84
85
/*[python input]
86
class Py_UCS4_converter(CConverter):
87
    type = 'Py_UCS4'
88
    converter = 'convert_uc'
89
90
    def converter_init(self):
91
        if self.default is not unspecified:
92
            self.c_default = ascii(self.default)
93
            if len(self.c_default) > 4 or self.c_default[0] != "'":
94
                self.c_default = hex(ord(self.default))
95
96
[python start generated code]*/
97
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
98
99
/* --- Globals ------------------------------------------------------------
100
101
NOTE: In the interpreter's initialization phase, some globals are currently
102
      initialized dynamically as needed. In the process Unicode objects may
103
      be created before the Unicode type is ready.
104
105
*/
106
107
82.1M
#define MAX_UNICODE _Py_MAX_UNICODE
108
109
#ifdef Py_DEBUG
110
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
111
#else
112
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
113
#endif
114
115
/* Read the `utf8` pointer stored in the compact-unicode header of `op`.
   The read goes through FT_ATOMIC_LOAD_PTR_ACQUIRE. */
static inline char *
_PyUnicode_UTF8(PyObject *op)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    return FT_ATOMIC_LOAD_PTR_ACQUIRE(compact->utf8);
}
119
120
/* Return the UTF-8 buffer associated with `op`.

   For a compact ASCII string this is the memory located directly after the
   PyASCIIObject header; for every other string it is whatever
   _PyUnicode_UTF8() reports. */
static inline char *
PyUnicode_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    if (!PyUnicode_IS_COMPACT_ASCII(op)) {
        return _PyUnicode_UTF8(op);
    }
    return (char *)(_PyASCIIObject_CAST(op) + 1);
}
130
131
/* Store `utf8` into the `utf8` field of the compact-unicode header of `op`.
   The write goes through FT_ATOMIC_STORE_PTR_RELEASE. */
static inline void
PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    FT_ATOMIC_STORE_PTR_RELEASE(compact->utf8, utf8);
}
135
136
/* Return the length recorded for the UTF-8 form of `op`: the `length` field
   for a compact ASCII string, the `utf8_length` field otherwise. */
static inline Py_ssize_t
PyUnicode_UTF8_LENGTH(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    return PyUnicode_IS_COMPACT_ASCII(op)
        ? _PyASCIIObject_CAST(op)->length
        : _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
146
147
/* Overwrite the `utf8_length` field of the compact-unicode header of `op`. */
static inline void
PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
    compact->utf8_length = length;
}
151
152
/* Direct (lvalue) access to fields of the PyASCIIObject header of `op`. */
#define _PyUnicode_LENGTH(op) (_PyASCIIObject_CAST(op)->length)
#define _PyUnicode_STATE(op)  (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op)   (_PyASCIIObject_CAST(op)->hash)

/* Local short name for PyUnstable_Unicode_GET_CACHED_HASH. */
#define PyUnicode_HASH PyUnstable_Unicode_GET_CACHED_HASH
160
161
/* Store `hash` into the `hash` field of the PyASCIIObject header of `op`.
   The write goes through FT_ATOMIC_STORE_SSIZE_RELAXED. */
static inline void
PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
    PyASCIIObject *header = _PyASCIIObject_CAST(op);
    FT_ATOMIC_STORE_SSIZE_RELAXED(header->hash, hash);
}
165
166
/* Direct (lvalue) access to the `data.any` pointer of the PyUnicodeObject
   view of `op`. */
#define _PyUnicode_DATA_ANY(op) (_PyUnicodeObject_CAST(op)->data.any)
168
169
/* Return non-zero when the UTF-8 pointer of `op` is the very same address as
   its character data, i.e. both representations share one buffer.
   Must not be called on a compact ASCII string (asserted). */
static inline int
_PyUnicode_SHARE_UTF8(PyObject *op)
{
    assert(_PyUnicode_CHECK(op));
    assert(!PyUnicode_IS_COMPACT_ASCII(op));

    int shared = (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
    return shared;
}
175
176
/* true if the Unicode object has an allocated UTF-8 memory block
177
   (not shared with other data) */
178
static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
179
572M
{
180
572M
    return (!PyUnicode_IS_COMPACT_ASCII(op)
181
230M
            && _PyUnicode_UTF8(op) != NULL
182
9.86M
            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
183
572M
}
184
185
186
/* Generic helper macro that copies characters while changing their width.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to should be
   of type "to_type *" and designates the buffer that receives the result;
   each character is converted with a plain cast to to_type.

   The bulk of the input is handled four characters per iteration; the
   remaining (at most three) characters are copied one at a time.
   The arguments are evaluated once, in the order: to, begin, end. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_conv_out = (to_type *)(to);                           \
        const from_type *_conv_in = (const from_type *)(begin);         \
        const from_type *_conv_end = (const from_type *)(end);          \
        Py_ssize_t _conv_len = (_conv_end) - (_conv_in);                \
        const from_type *_conv_unrolled_end =                           \
            _conv_in + _Py_SIZE_ROUND_DOWN(_conv_len, 4);               \
        for (; _conv_in < (_conv_unrolled_end);                         \
             _conv_in += 4, _conv_out += 4) {                           \
            _conv_out[0] = (to_type) _conv_in[0];                       \
            _conv_out[1] = (to_type) _conv_in[1];                       \
            _conv_out[2] = (to_type) _conv_in[2];                       \
            _conv_out[3] = (to_type) _conv_in[3];                       \
        }                                                               \
        for (; _conv_in < (_conv_end); _conv_in++, _conv_out++) {       \
            *_conv_out = (to_type) *_conv_in;                           \
        }                                                               \
    } while (0)
209
210
258M
#define LATIN1 _Py_LATIN1_CHR
211
212
#ifdef MS_WINDOWS
213
   /* On Windows, overallocating by 50% is the best factor */
214
#  define OVERALLOCATE_FACTOR 2
215
#else
216
   /* On Linux, overallocating by 25% is the best factor */
217
100M
#  define OVERALLOCATE_FACTOR 4
218
#endif
219
220
/* Forward declaration */
221
static inline int
222
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
223
static inline void
224
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
225
static PyObject *
226
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
227
                    const char *errors);
228
static PyObject *
229
unicode_decode_utf8(const char *s, Py_ssize_t size,
230
                    _Py_error_handler error_handler, const char *errors,
231
                    Py_ssize_t *consumed);
232
static int
233
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
234
                           const char *s, Py_ssize_t size,
235
                           _Py_error_handler error_handler, const char *errors,
236
                           Py_ssize_t *consumed);
237
#ifdef Py_DEBUG
238
static inline int unicode_is_finalizing(void);
239
static int unicode_is_singleton(PyObject *unicode);
240
#endif
241
242
243
// Return a reference to the immortal empty string singleton.
244
static inline PyObject* unicode_get_empty(void)
245
118M
{
246
118M
    _Py_DECLARE_STR(empty, "");
247
118M
    return &_Py_STR(empty);
248
118M
}
249
250
/* This dictionary holds per-interpreter interned strings.
251
 * See InternalDocs/string_interning.md for details.
252
 */
253
static inline PyObject *get_interned_dict(PyInterpreterState *interp)
254
3.71M
{
255
3.71M
    return _Py_INTERP_CACHED_OBJECT(interp, interned_strings);
256
3.71M
}
257
258
/* This hashtable holds statically allocated interned strings.
259
 * See InternalDocs/string_interning.md for details.
260
 */
261
3.38M
#define INTERNED_STRINGS _PyRuntime.cached_objects.interned_strings
262
263
/* Get number of all interned strings for the current interpreter. */
264
Py_ssize_t
265
_PyUnicode_InternedSize(void)
266
0
{
267
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
268
0
    return _Py_hashtable_len(INTERNED_STRINGS) + PyDict_GET_SIZE(dict);
269
0
}
270
271
/* Get number of immortal interned strings for the current interpreter. */
272
Py_ssize_t
273
_PyUnicode_InternedSize_Immortal(void)
274
0
{
275
0
    PyObject *dict = get_interned_dict(_PyInterpreterState_GET());
276
0
    PyObject *key, *value;
277
0
    Py_ssize_t pos = 0;
278
0
    Py_ssize_t count = 0;
279
280
    // It's tempting to keep a count and avoid a loop here. But, this function
281
    // is intended for refleak tests. It spends extra work to report the true
282
    // value, to help detect bugs in optimizations.
283
284
0
    while (PyDict_Next(dict, &pos, &key, &value)) {
285
0
        assert(PyUnicode_CHECK_INTERNED(key) != SSTATE_INTERNED_IMMORTAL_STATIC);
286
0
        if (PyUnicode_CHECK_INTERNED(key) == SSTATE_INTERNED_IMMORTAL) {
287
0
           count++;
288
0
       }
289
0
    }
290
0
    return _Py_hashtable_len(INTERNED_STRINGS) + count;
291
0
}
292
293
static Py_hash_t unicode_hash(PyObject *);
294
295
static Py_uhash_t
296
hashtable_unicode_hash(const void *key)
297
3.38M
{
298
3.38M
    return unicode_hash((PyObject *)key);
299
3.38M
}
300
301
static int
302
hashtable_unicode_compare(const void *key1, const void *key2)
303
288k
{
304
288k
    PyObject *obj1 = (PyObject *)key1;
305
288k
    PyObject *obj2 = (PyObject *)key2;
306
288k
    if (obj1 != NULL && obj2 != NULL) {
307
288k
        return unicode_eq(obj1, obj2);
308
288k
    }
309
0
    else {
310
0
        return obj1 == obj2;
311
0
    }
312
288k
}
313
314
/* Return true if this interpreter should share the main interpreter's
315
   intern_dict.  That's important for interpreters which load basic
316
   single-phase init extension modules (m_size == -1).  There could be interned
317
   immortal strings that are shared between interpreters, due to the
318
   PyDict_Update(mdict, m_copy) call in import_find_extension().
319
320
   It's not safe to deallocate those strings until all interpreters that
321
   potentially use them are freed.  By storing them in the main interpreter, we
322
   ensure they get freed after all other interpreters are freed.
323
*/
324
static bool
325
has_shared_intern_dict(PyInterpreterState *interp)
326
16
{
327
16
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
328
16
    return interp != main_interp  && interp->feature_flags & Py_RTFLAGS_USE_MAIN_OBMALLOC;
329
16
}
330
331
static int
332
init_interned_dict(PyInterpreterState *interp)
333
16
{
334
16
    assert(get_interned_dict(interp) == NULL);
335
16
    PyObject *interned;
336
16
    if (has_shared_intern_dict(interp)) {
337
0
        interned = get_interned_dict(_PyInterpreterState_Main());
338
0
        Py_INCREF(interned);
339
0
    }
340
16
    else {
341
16
        interned = PyDict_New();
342
16
        if (interned == NULL) {
343
0
            return -1;
344
0
        }
345
16
    }
346
16
    _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = interned;
347
16
    return 0;
348
16
}
349
350
static void
351
clear_interned_dict(PyInterpreterState *interp)
352
0
{
353
0
    PyObject *interned = get_interned_dict(interp);
354
0
    if (interned != NULL) {
355
0
        if (!has_shared_intern_dict(interp)) {
356
            // only clear if the dict belongs to this interpreter
357
0
            PyDict_Clear(interned);
358
0
        }
359
0
        Py_DECREF(interned);
360
0
        _Py_INTERP_CACHED_OBJECT(interp, interned_strings) = NULL;
361
0
    }
362
0
}
363
364
static PyStatus
365
init_global_interned_strings(PyInterpreterState *interp)
366
16
{
367
16
    assert(INTERNED_STRINGS == NULL);
368
16
    _Py_hashtable_allocator_t hashtable_alloc = {PyMem_RawMalloc, PyMem_RawFree};
369
370
16
    INTERNED_STRINGS = _Py_hashtable_new_full(
371
16
        hashtable_unicode_hash,
372
16
        hashtable_unicode_compare,
373
        // Objects stored here are immortal and statically allocated,
374
        // so we don't need key_destroy_func & value_destroy_func:
375
16
        NULL,
376
16
        NULL,
377
16
        &hashtable_alloc
378
16
    );
379
16
    if (INTERNED_STRINGS == NULL) {
380
0
        PyErr_Clear();
381
0
        return _PyStatus_ERR("failed to create global interned dict");
382
0
    }
383
384
    /* Intern statically allocated string identifiers, deepfreeze strings,
385
        * and one-byte latin-1 strings.
386
        * This must be done before any module initialization so that statically
387
        * allocated string identifiers are used instead of heap allocated strings.
388
        * Deepfreeze uses the interned identifiers if present to save space
389
        * else generates them and they are interned to speed up dict lookups.
390
    */
391
16
    _PyUnicode_InitStaticStrings(interp);
392
393
4.11k
    for (int i = 0; i < 256; i++) {
394
4.09k
        PyObject *s = LATIN1(i);
395
4.09k
        _PyUnicode_InternStatic(interp, &s);
396
4.09k
        assert(s == LATIN1(i));
397
4.09k
    }
398
#ifdef Py_DEBUG
399
    assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1));
400
401
    for (int i = 0; i < 256; i++) {
402
        assert(_PyUnicode_CheckConsistency(LATIN1(i), 1));
403
    }
404
#endif
405
16
    return _PyStatus_OK();
406
16
}
407
408
static void clear_global_interned_strings(void)
409
0
{
410
0
    if (INTERNED_STRINGS != NULL) {
411
0
        _Py_hashtable_destroy(INTERNED_STRINGS);
412
0
        INTERNED_STRINGS = NULL;
413
0
    }
414
0
}
415
416
/* Statement macro: return the empty string singleton from the enclosing
   function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_get_empty(); } while (0)
420
421
422
/* Fast detection of the most frequent whitespace characters.
   Lookup table indexed by an ASCII code point (0..127): the entry is 1 for
   the characters listed below and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
452
453
/* forward */
454
static PyObject* get_latin1_char(unsigned char ch);
455
static int unicode_modifiable(PyObject *unicode);
456
457
458
static PyObject *
459
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
460
static PyObject *
461
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
462
static PyObject *
463
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
464
465
static PyObject *
466
unicode_encode_call_errorhandler(const char *errors,
467
       PyObject **errorHandler,const char *encoding, const char *reason,
468
       PyObject *unicode, PyObject **exceptionObject,
469
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
470
471
static void
472
raise_encode_exception(PyObject **exceptionObject,
473
                       const char *encoding,
474
                       PyObject *unicode,
475
                       Py_ssize_t startpos, Py_ssize_t endpos,
476
                       const char *reason);
477
478
/* Same for linebreaks: lookup table indexed by an ASCII code point
   (0..127); the entry is 1 for the characters listed below and 0 for
   everything else. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
505
506
static int convert_uc(PyObject *obj, void *addr);
507
508
struct encoding_map;
509
#include "clinic/unicodeobject.c.h"
510
511
_Py_error_handler
512
_Py_GetErrorHandler(const char *errors)
513
526k
{
514
526k
    if (errors == NULL || strcmp(errors, "strict") == 0) {
515
200k
        return _Py_ERROR_STRICT;
516
200k
    }
517
325k
    if (strcmp(errors, "surrogateescape") == 0) {
518
165k
        return _Py_ERROR_SURROGATEESCAPE;
519
165k
    }
520
159k
    if (strcmp(errors, "replace") == 0) {
521
159k
        return _Py_ERROR_REPLACE;
522
159k
    }
523
0
    if (strcmp(errors, "ignore") == 0) {
524
0
        return _Py_ERROR_IGNORE;
525
0
    }
526
0
    if (strcmp(errors, "backslashreplace") == 0) {
527
0
        return _Py_ERROR_BACKSLASHREPLACE;
528
0
    }
529
0
    if (strcmp(errors, "surrogatepass") == 0) {
530
0
        return _Py_ERROR_SURROGATEPASS;
531
0
    }
532
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
533
0
        return _Py_ERROR_XMLCHARREFREPLACE;
534
0
    }
535
0
    return _Py_ERROR_OTHER;
536
0
}
537
538
539
static _Py_error_handler
540
get_error_handler_wide(const wchar_t *errors)
541
5.57k
{
542
5.57k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
543
0
        return _Py_ERROR_STRICT;
544
0
    }
545
5.57k
    if (wcscmp(errors, L"surrogateescape") == 0) {
546
5.57k
        return _Py_ERROR_SURROGATEESCAPE;
547
5.57k
    }
548
0
    if (wcscmp(errors, L"replace") == 0) {
549
0
        return _Py_ERROR_REPLACE;
550
0
    }
551
0
    if (wcscmp(errors, L"ignore") == 0) {
552
0
        return _Py_ERROR_IGNORE;
553
0
    }
554
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
555
0
        return _Py_ERROR_BACKSLASHREPLACE;
556
0
    }
557
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
558
0
        return _Py_ERROR_SURROGATEPASS;
559
0
    }
560
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
561
0
        return _Py_ERROR_XMLCHARREFREPLACE;
562
0
    }
563
0
    return _Py_ERROR_OTHER;
564
0
}
565
566
567
static inline int
568
unicode_check_encoding_errors(const char *encoding, const char *errors)
569
22.1M
{
570
22.1M
    if (encoding == NULL && errors == NULL) {
571
11.2M
        return 0;
572
11.2M
    }
573
574
10.8M
    PyInterpreterState *interp = _PyInterpreterState_GET();
575
10.8M
#ifndef Py_DEBUG
576
    /* In release mode, only check in development mode (-X dev) */
577
10.8M
    if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
578
10.8M
        return 0;
579
10.8M
    }
580
#else
581
    /* Always check in debug mode */
582
#endif
583
584
    /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
585
       codec registry is ready: before_PyUnicode_InitEncodings() is called. */
586
0
    if (!interp->unicode.fs_codec.encoding) {
587
0
        return 0;
588
0
    }
589
590
    /* Disable checks during Python finalization. For example, it allows to
591
       call _PyObject_Dump() during finalization for debugging purpose. */
592
0
    if (_PyInterpreterState_GetFinalizing(interp) != NULL) {
593
0
        return 0;
594
0
    }
595
596
0
    if (encoding != NULL
597
        // Fast path for the most common built-in encodings. Even if the codec
598
        // is cached, _PyCodec_Lookup() decodes the bytes string from UTF-8 to
599
        // create a temporary Unicode string (the key in the cache).
600
0
        && strcmp(encoding, "utf-8") != 0
601
0
        && strcmp(encoding, "utf8") != 0
602
0
        && strcmp(encoding, "ascii") != 0)
603
0
    {
604
0
        PyObject *handler = _PyCodec_Lookup(encoding);
605
0
        if (handler == NULL) {
606
0
            return -1;
607
0
        }
608
0
        Py_DECREF(handler);
609
0
    }
610
611
0
    if (errors != NULL
612
        // Fast path for the most common built-in error handlers.
613
0
        && strcmp(errors, "strict") != 0
614
0
        && strcmp(errors, "ignore") != 0
615
0
        && strcmp(errors, "replace") != 0
616
0
        && strcmp(errors, "surrogateescape") != 0
617
0
        && strcmp(errors, "surrogatepass") != 0)
618
0
    {
619
0
        PyObject *handler = PyCodec_LookupError(errors);
620
0
        if (handler == NULL) {
621
0
            return -1;
622
0
        }
623
0
        Py_DECREF(handler);
624
0
    }
625
0
    return 0;
626
0
}
627
628
629
int
630
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
631
0
{
632
0
#define CHECK(expr) \
633
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
634
635
0
    assert(op != NULL);
636
0
    CHECK(PyUnicode_Check(op));
637
638
0
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
639
0
    int kind = ascii->state.kind;
640
641
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
642
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
643
0
    }
644
0
    else {
645
0
        PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
646
0
        void *data;
647
648
0
        if (ascii->state.compact == 1) {
649
0
            data = compact + 1;
650
0
            CHECK(kind == PyUnicode_1BYTE_KIND
651
0
                                 || kind == PyUnicode_2BYTE_KIND
652
0
                                 || kind == PyUnicode_4BYTE_KIND);
653
0
            CHECK(ascii->state.ascii == 0);
654
0
            CHECK(_PyUnicode_UTF8(op) != data);
655
0
        }
656
0
        else {
657
0
            PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
658
659
0
            data = unicode->data.any;
660
0
            CHECK(kind == PyUnicode_1BYTE_KIND
661
0
                     || kind == PyUnicode_2BYTE_KIND
662
0
                     || kind == PyUnicode_4BYTE_KIND);
663
0
            CHECK(ascii->state.compact == 0);
664
0
            CHECK(data != NULL);
665
0
            if (ascii->state.ascii) {
666
0
                CHECK(_PyUnicode_UTF8(op) == data);
667
0
                CHECK(compact->utf8_length == ascii->length);
668
0
            }
669
0
            else {
670
0
                CHECK(_PyUnicode_UTF8(op) != data);
671
0
            }
672
0
        }
673
0
#ifndef Py_GIL_DISABLED
674
0
        if (_PyUnicode_UTF8(op) == NULL)
675
0
            CHECK(compact->utf8_length == 0);
676
0
#endif
677
0
    }
678
679
    /* check that the best kind is used: O(n) operation */
680
0
    if (check_content) {
681
0
        Py_ssize_t i;
682
0
        Py_UCS4 maxchar = 0;
683
0
        const void *data;
684
0
        Py_UCS4 ch;
685
686
0
        data = PyUnicode_DATA(ascii);
687
0
        for (i=0; i < ascii->length; i++)
688
0
        {
689
0
            ch = PyUnicode_READ(kind, data, i);
690
0
            if (ch > maxchar)
691
0
                maxchar = ch;
692
0
        }
693
0
        if (kind == PyUnicode_1BYTE_KIND) {
694
0
            if (ascii->state.ascii == 0) {
695
0
                CHECK(maxchar >= 128);
696
0
                CHECK(maxchar <= 255);
697
0
            }
698
0
            else
699
0
                CHECK(maxchar < 128);
700
0
        }
701
0
        else if (kind == PyUnicode_2BYTE_KIND) {
702
0
            CHECK(maxchar >= 0x100);
703
0
            CHECK(maxchar <= 0xFFFF);
704
0
        }
705
0
        else {
706
0
            CHECK(maxchar >= 0x10000);
707
0
            CHECK(maxchar <= MAX_UNICODE);
708
0
        }
709
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
710
0
    }
711
712
    /* Check interning state */
713
#ifdef Py_DEBUG
714
    // Note that we do not check `_Py_IsImmortal(op)`, since stable ABI
715
    // extensions can make immortal strings mortal (but with a high enough
716
    // refcount).
717
    // The other way is extremely unlikely (worth a potential failed assertion
718
    // in a debug build), so we do check `!_Py_IsImmortal(op)`.
719
    switch (PyUnicode_CHECK_INTERNED(op)) {
720
        case SSTATE_NOT_INTERNED:
721
            if (ascii->state.statically_allocated) {
722
                // This state is for two exceptions:
723
                // - strings are currently checked before they're interned
724
                // - the 256 one-latin1-character strings
725
                //   are static but use SSTATE_NOT_INTERNED
726
            }
727
            else {
728
                CHECK(!_Py_IsImmortal(op));
729
            }
730
            break;
731
        case SSTATE_INTERNED_MORTAL:
732
            CHECK(!ascii->state.statically_allocated);
733
            CHECK(!_Py_IsImmortal(op));
734
            break;
735
        case SSTATE_INTERNED_IMMORTAL:
736
            CHECK(!ascii->state.statically_allocated);
737
            break;
738
        case SSTATE_INTERNED_IMMORTAL_STATIC:
739
            CHECK(ascii->state.statically_allocated);
740
            break;
741
        default:
742
            Py_UNREACHABLE();
743
    }
744
#endif
745
746
0
    return 1;
747
748
0
#undef CHECK
749
0
}
750
751
/* Normalize a freshly built string before handing it out.

   An empty string is replaced by the empty string singleton, and a string
   made of one character of 1-byte kind is replaced by the matching LATIN1()
   singleton.  In both cases the reference to `unicode` is released, unless
   `unicode` already is that singleton.  Any other string is returned as is
   (after a consistency assertion). */
static PyObject *
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));

    PyObject *singleton = NULL;
    switch (PyUnicode_GET_LENGTH(unicode)) {
    case 0:
        singleton = unicode_get_empty();
        break;
    case 1:
        if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
            const Py_UCS1 *chars = PyUnicode_1BYTE_DATA(unicode);
            Py_UCS1 first = chars[0];
            singleton = LATIN1(first);
        }
        break;
    default:
        break;
    }

    if (singleton != NULL) {
        if (unicode != singleton) {
            Py_DECREF(unicode);
        }
        return singleton;
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
781
782
/* Return `unicode` as a result whose value is unchanged: a new reference to
   the object itself when PyUnicode_CheckExact() holds, otherwise the result
   of _PyUnicode_Copy() so that a subtype instance is turned into a genuine
   unicode string with the same value. */
static PyObject *
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }
    return Py_NewRef(unicode);
}
792
793
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
794
   ASCII, Latin1, UTF-8, etc. */
795
static char*
796
backslashreplace(PyBytesWriter *writer, char *str,
797
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
798
0
{
799
0
    Py_ssize_t size, i;
800
0
    Py_UCS4 ch;
801
0
    int kind;
802
0
    const void *data;
803
804
0
    kind = PyUnicode_KIND(unicode);
805
0
    data = PyUnicode_DATA(unicode);
806
807
0
    size = 0;
808
    /* determine replacement size */
809
0
    for (i = collstart; i < collend; ++i) {
810
0
        Py_ssize_t incr;
811
812
0
        ch = PyUnicode_READ(kind, data, i);
813
0
        if (ch < 0x100)
814
0
            incr = 2+2;
815
0
        else if (ch < 0x10000)
816
0
            incr = 2+4;
817
0
        else {
818
0
            assert(ch <= MAX_UNICODE);
819
0
            incr = 2+8;
820
0
        }
821
0
        if (size > PY_SSIZE_T_MAX - incr) {
822
0
            PyErr_SetString(PyExc_OverflowError,
823
0
                            "encoded result is too long for a Python string");
824
0
            return NULL;
825
0
        }
826
0
        size += incr;
827
0
    }
828
829
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
830
0
    if (str == NULL) {
831
0
        return NULL;
832
0
    }
833
834
    /* generate replacement */
835
0
    for (i = collstart; i < collend; ++i) {
836
0
        ch = PyUnicode_READ(kind, data, i);
837
0
        *str++ = '\\';
838
0
        if (ch >= 0x00010000) {
839
0
            *str++ = 'U';
840
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
841
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
842
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
843
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
844
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
845
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
846
0
        }
847
0
        else if (ch >= 0x100) {
848
0
            *str++ = 'u';
849
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
850
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
851
0
        }
852
0
        else
853
0
            *str++ = 'x';
854
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
855
0
        *str++ = Py_hexdigits[ch&0xf];
856
0
    }
857
0
    return str;
858
0
}
859
860
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
861
   ASCII, Latin1, UTF-8, etc. */
862
static char*
863
xmlcharrefreplace(PyBytesWriter *writer, char *str,
864
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
865
0
{
866
0
    Py_ssize_t size, i;
867
0
    Py_UCS4 ch;
868
0
    int kind;
869
0
    const void *data;
870
871
0
    kind = PyUnicode_KIND(unicode);
872
0
    data = PyUnicode_DATA(unicode);
873
874
0
    size = 0;
875
    /* determine replacement size */
876
0
    for (i = collstart; i < collend; ++i) {
877
0
        Py_ssize_t incr;
878
879
0
        ch = PyUnicode_READ(kind, data, i);
880
0
        if (ch < 10)
881
0
            incr = 2+1+1;
882
0
        else if (ch < 100)
883
0
            incr = 2+2+1;
884
0
        else if (ch < 1000)
885
0
            incr = 2+3+1;
886
0
        else if (ch < 10000)
887
0
            incr = 2+4+1;
888
0
        else if (ch < 100000)
889
0
            incr = 2+5+1;
890
0
        else if (ch < 1000000)
891
0
            incr = 2+6+1;
892
0
        else {
893
0
            assert(ch <= MAX_UNICODE);
894
0
            incr = 2+7+1;
895
0
        }
896
0
        if (size > PY_SSIZE_T_MAX - incr) {
897
0
            PyErr_SetString(PyExc_OverflowError,
898
0
                            "encoded result is too long for a Python string");
899
0
            return NULL;
900
0
        }
901
0
        size += incr;
902
0
    }
903
904
0
    str = PyBytesWriter_GrowAndUpdatePointer(writer, size, str);
905
0
    if (str == NULL) {
906
0
        return NULL;
907
0
    }
908
909
    /* generate replacement */
910
0
    for (i = collstart; i < collend; ++i) {
911
0
        size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
912
0
        if (size < 0) {
913
0
            return NULL;
914
0
        }
915
0
        str += size;
916
0
    }
917
0
    return str;
918
0
}
919
920
/* --- Bloom Filters ----------------------------------------------------- */
921
922
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
925
926
/* the linebreak mask is set up by _PyUnicode_Init() below */
927
928
/* BLOOM_WIDTH: number of distinct bit positions used in a bloom mask,
   chosen from the width of a long. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so that every character passes the filter. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by 'ch' is set in 'mask'.  Both arguments are
   parenthesized so that any expression can be passed safely. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: table lookup for ch < 128, otherwise the bloom filter is
   consulted first and Py_UNICODE_ISLINEBREAK() only on a filter hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
947
948
static inline BLOOM_MASK
949
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
950
8.44M
{
951
8.44M
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
952
8.44M
    do {                                               \
953
8.44M
        TYPE *data = (TYPE *)PTR;                      \
954
8.44M
        TYPE *end = data + LEN;                        \
955
8.44M
        Py_UCS4 ch;                                    \
956
18.4M
        for (; data != end; data++) {                  \
957
9.98M
            ch = *data;                                \
958
9.98M
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
959
9.98M
        }                                              \
960
8.44M
        break;                                         \
961
8.44M
    } while (0)
962
963
    /* calculate simple bloom-style bitmask for a given unicode string */
964
965
8.44M
    BLOOM_MASK mask;
966
967
8.44M
    mask = 0;
968
8.44M
    switch (kind) {
969
8.44M
    case PyUnicode_1BYTE_KIND:
970
8.44M
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
971
8.44M
        break;
972
16
    case PyUnicode_2BYTE_KIND:
973
16
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
974
16
        break;
975
0
    case PyUnicode_4BYTE_KIND:
976
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
977
0
        break;
978
0
    default:
979
0
        Py_UNREACHABLE();
980
8.44M
    }
981
8.44M
    return mask;
982
983
8.44M
#undef BLOOM_UPDATE
984
8.44M
}
985
986
static int
987
ensure_unicode(PyObject *obj)
988
166M
{
989
166M
    if (!PyUnicode_Check(obj)) {
990
0
        PyErr_Format(PyExc_TypeError,
991
0
                     "must be str, not %.100s",
992
0
                     Py_TYPE(obj)->tp_name);
993
0
        return -1;
994
0
    }
995
166M
    return 0;
996
166M
}
997
998
/* Compilation of templated routines */
999
1000
1.06M
#define STRINGLIB_GET_EMPTY() unicode_get_empty()
1001
1002
#include "stringlib/asciilib.h"
1003
#include "stringlib/fastsearch.h"
1004
#include "stringlib/partition.h"
1005
#include "stringlib/split.h"
1006
#include "stringlib/count.h"
1007
#include "stringlib/find.h"
1008
#include "stringlib/find_max_char.h"
1009
#include "stringlib/undef.h"
1010
1011
#include "stringlib/ucs1lib.h"
1012
#include "stringlib/fastsearch.h"
1013
#include "stringlib/partition.h"
1014
#include "stringlib/split.h"
1015
#include "stringlib/count.h"
1016
#include "stringlib/find.h"
1017
#include "stringlib/replace.h"
1018
#include "stringlib/repr.h"
1019
#include "stringlib/find_max_char.h"
1020
#include "stringlib/undef.h"
1021
1022
#include "stringlib/ucs2lib.h"
1023
#include "stringlib/fastsearch.h"
1024
#include "stringlib/partition.h"
1025
#include "stringlib/split.h"
1026
#include "stringlib/count.h"
1027
#include "stringlib/find.h"
1028
#include "stringlib/replace.h"
1029
#include "stringlib/repr.h"
1030
#include "stringlib/find_max_char.h"
1031
#include "stringlib/undef.h"
1032
1033
#include "stringlib/ucs4lib.h"
1034
#include "stringlib/fastsearch.h"
1035
#include "stringlib/partition.h"
1036
#include "stringlib/split.h"
1037
#include "stringlib/count.h"
1038
#include "stringlib/find.h"
1039
#include "stringlib/replace.h"
1040
#include "stringlib/repr.h"
1041
#include "stringlib/find_max_char.h"
1042
#include "stringlib/undef.h"
1043
1044
#undef STRINGLIB_GET_EMPTY
1045
1046
/* --- Unicode Object ----------------------------------------------------- */
1047
1048
static inline Py_ssize_t
1049
findchar(const void *s, int kind,
1050
         Py_ssize_t size, Py_UCS4 ch,
1051
         int direction)
1052
105M
{
1053
105M
    switch (kind) {
1054
97.7M
    case PyUnicode_1BYTE_KIND:
1055
97.7M
        if ((Py_UCS1) ch != ch)
1056
3.60k
            return -1;
1057
97.7M
        if (direction > 0)
1058
97.7M
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1059
7.83k
        else
1060
7.83k
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
1061
6.54M
    case PyUnicode_2BYTE_KIND:
1062
6.54M
        if ((Py_UCS2) ch != ch)
1063
0
            return -1;
1064
6.54M
        if (direction > 0)
1065
6.51M
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1066
27.6k
        else
1067
27.6k
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
1068
866k
    case PyUnicode_4BYTE_KIND:
1069
866k
        if (direction > 0)
1070
773k
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
1071
92.4k
        else
1072
92.4k
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
1073
0
    default:
1074
0
        Py_UNREACHABLE();
1075
105M
    }
1076
105M
}
1077
1078
#ifdef Py_DEBUG
1079
/* Fill the data of a Unicode string with invalid characters to detect bugs
1080
   earlier.
1081
1082
   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
1083
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1084
   invalid character in Unicode 6.0. */
1085
static void
1086
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1087
{
1088
    int kind = PyUnicode_KIND(unicode);
1089
    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1090
    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1091
    if (length <= old_length)
1092
        return;
1093
    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1094
}
1095
#endif
1096
1097
static PyObject*
1098
resize_copy(PyObject *unicode, Py_ssize_t length)
1099
0
{
1100
0
    Py_ssize_t copy_length;
1101
0
    PyObject *copy;
1102
1103
0
    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1104
0
    if (copy == NULL)
1105
0
        return NULL;
1106
1107
0
    copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1108
0
    _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1109
0
    return copy;
1110
0
}
1111
1112
static PyObject*
1113
resize_compact(PyObject *unicode, Py_ssize_t length)
1114
58.5M
{
1115
58.5M
    Py_ssize_t char_size;
1116
58.5M
    Py_ssize_t struct_size;
1117
58.5M
    Py_ssize_t new_size;
1118
58.5M
    PyObject *new_unicode;
1119
#ifdef Py_DEBUG
1120
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1121
#endif
1122
1123
58.5M
    if (!unicode_modifiable(unicode)) {
1124
0
        PyObject *copy = resize_copy(unicode, length);
1125
0
        if (copy == NULL) {
1126
0
            return NULL;
1127
0
        }
1128
0
        Py_DECREF(unicode);
1129
0
        return copy;
1130
0
    }
1131
58.5M
    assert(PyUnicode_IS_COMPACT(unicode));
1132
1133
58.5M
    char_size = PyUnicode_KIND(unicode);
1134
58.5M
    if (PyUnicode_IS_ASCII(unicode))
1135
50.3M
        struct_size = sizeof(PyASCIIObject);
1136
8.23M
    else
1137
8.23M
        struct_size = sizeof(PyCompactUnicodeObject);
1138
1139
58.5M
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1140
0
        PyErr_NoMemory();
1141
0
        return NULL;
1142
0
    }
1143
58.5M
    new_size = (struct_size + (length + 1) * char_size);
1144
1145
58.5M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1146
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1147
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1148
0
        PyUnicode_SET_UTF8(unicode, NULL);
1149
0
    }
1150
#ifdef Py_TRACE_REFS
1151
    _Py_ForgetReference(unicode);
1152
#endif
1153
58.5M
    _PyReftracerTrack(unicode, PyRefTracer_DESTROY);
1154
1155
58.5M
    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
1156
58.5M
    if (new_unicode == NULL) {
1157
0
        _Py_NewReferenceNoTotal(unicode);
1158
0
        PyErr_NoMemory();
1159
0
        return NULL;
1160
0
    }
1161
58.5M
    unicode = new_unicode;
1162
58.5M
    _Py_NewReferenceNoTotal(unicode);
1163
1164
58.5M
    _PyUnicode_LENGTH(unicode) = length;
1165
#ifdef Py_DEBUG
1166
    unicode_fill_invalid(unicode, old_length);
1167
#endif
1168
58.5M
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1169
58.5M
                    length, 0);
1170
58.5M
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1171
58.5M
    return unicode;
1172
58.5M
}
1173
1174
static int
1175
resize_inplace(PyObject *unicode, Py_ssize_t length)
1176
0
{
1177
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1178
0
    assert(Py_REFCNT(unicode) == 1);
1179
1180
0
    Py_ssize_t new_size;
1181
0
    Py_ssize_t char_size;
1182
0
    int share_utf8;
1183
0
    void *data;
1184
#ifdef Py_DEBUG
1185
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1186
#endif
1187
1188
0
    data = _PyUnicode_DATA_ANY(unicode);
1189
0
    char_size = PyUnicode_KIND(unicode);
1190
0
    share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1191
1192
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1193
0
        PyErr_NoMemory();
1194
0
        return -1;
1195
0
    }
1196
0
    new_size = (length + 1) * char_size;
1197
1198
0
    if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1199
0
    {
1200
0
        PyMem_Free(_PyUnicode_UTF8(unicode));
1201
0
        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
1202
0
        PyUnicode_SET_UTF8(unicode, NULL);
1203
0
    }
1204
1205
0
    data = (PyObject *)PyObject_Realloc(data, new_size);
1206
0
    if (data == NULL) {
1207
0
        PyErr_NoMemory();
1208
0
        return -1;
1209
0
    }
1210
0
    _PyUnicode_DATA_ANY(unicode) = data;
1211
0
    if (share_utf8) {
1212
0
        PyUnicode_SET_UTF8_LENGTH(unicode, length);
1213
0
        PyUnicode_SET_UTF8(unicode, data);
1214
0
    }
1215
0
    _PyUnicode_LENGTH(unicode) = length;
1216
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1217
#ifdef Py_DEBUG
1218
    unicode_fill_invalid(unicode, old_length);
1219
#endif
1220
1221
    /* check for integer overflow */
1222
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1223
0
        PyErr_NoMemory();
1224
0
        return -1;
1225
0
    }
1226
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1227
0
    return 0;
1228
0
}
1229
1230
static const char*
1231
unicode_kind_name(PyObject *unicode)
1232
0
{
1233
    /* don't check consistency: unicode_kind_name() is called from
1234
       _PyUnicode_Dump() */
1235
0
    if (!PyUnicode_IS_COMPACT(unicode))
1236
0
    {
1237
0
        switch (PyUnicode_KIND(unicode))
1238
0
        {
1239
0
        case PyUnicode_1BYTE_KIND:
1240
0
            if (PyUnicode_IS_ASCII(unicode))
1241
0
                return "legacy ascii";
1242
0
            else
1243
0
                return "legacy latin1";
1244
0
        case PyUnicode_2BYTE_KIND:
1245
0
            return "legacy UCS2";
1246
0
        case PyUnicode_4BYTE_KIND:
1247
0
            return "legacy UCS4";
1248
0
        default:
1249
0
            return "<legacy invalid kind>";
1250
0
        }
1251
0
    }
1252
0
    switch (PyUnicode_KIND(unicode)) {
1253
0
    case PyUnicode_1BYTE_KIND:
1254
0
        if (PyUnicode_IS_ASCII(unicode))
1255
0
            return "ascii";
1256
0
        else
1257
0
            return "latin1";
1258
0
    case PyUnicode_2BYTE_KIND:
1259
0
        return "UCS2";
1260
0
    case PyUnicode_4BYTE_KIND:
1261
0
        return "UCS4";
1262
0
    default:
1263
0
        return "<invalid compact kind>";
1264
0
    }
1265
0
}
1266
1267
#ifdef Py_DEBUG
1268
/* Functions wrapping macros for use in debugger */
1269
/* Callable wrapper around PyUnicode_UTF8() taking an untyped pointer. */
const char *_PyUnicode_utf8(void *unicode_raw){
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1273
1274
/* Callable wrapper around _PyUnicode_COMPACT_DATA() taking an untyped
   pointer. */
const void *_PyUnicode_compact_data(void *unicode_raw) {
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1278
const void *_PyUnicode_data(void *unicode_raw) {
1279
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1280
    printf("obj %p\n", (void*)unicode);
1281
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1282
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1283
    printf("ascii op %p\n", (void*)(_PyASCIIObject_CAST(unicode) + 1));
1284
    printf("compact op %p\n", (void*)(_PyCompactUnicodeObject_CAST(unicode) + 1));
1285
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1286
    return PyUnicode_DATA(unicode);
1287
}
1288
1289
void
1290
_PyUnicode_Dump(PyObject *op)
1291
{
1292
    PyASCIIObject *ascii = _PyASCIIObject_CAST(op);
1293
    PyCompactUnicodeObject *compact = _PyCompactUnicodeObject_CAST(op);
1294
    PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
1295
    const void *data;
1296
1297
    if (ascii->state.compact)
1298
    {
1299
        if (ascii->state.ascii)
1300
            data = (ascii + 1);
1301
        else
1302
            data = (compact + 1);
1303
    }
1304
    else
1305
        data = unicode->data.any;
1306
    printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
1307
1308
    if (!ascii->state.ascii) {
1309
        printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
1310
    }
1311
    printf(", data=%p\n", data);
1312
}
1313
#endif
1314
1315
1316
PyObject *
1317
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1318
526M
{
1319
    /* Optimization for empty strings */
1320
526M
    if (size == 0) {
1321
23.6M
        return unicode_get_empty();
1322
23.6M
    }
1323
1324
503M
    PyObject *obj;
1325
503M
    PyCompactUnicodeObject *unicode;
1326
503M
    void *data;
1327
503M
    int kind;
1328
503M
    int is_ascii;
1329
503M
    Py_ssize_t char_size;
1330
503M
    Py_ssize_t struct_size;
1331
1332
503M
    is_ascii = 0;
1333
503M
    struct_size = sizeof(PyCompactUnicodeObject);
1334
503M
    if (maxchar < 128) {
1335
291M
        kind = PyUnicode_1BYTE_KIND;
1336
291M
        char_size = 1;
1337
291M
        is_ascii = 1;
1338
291M
        struct_size = sizeof(PyASCIIObject);
1339
291M
    }
1340
211M
    else if (maxchar < 256) {
1341
25.0M
        kind = PyUnicode_1BYTE_KIND;
1342
25.0M
        char_size = 1;
1343
25.0M
    }
1344
186M
    else if (maxchar < 65536) {
1345
180M
        kind = PyUnicode_2BYTE_KIND;
1346
180M
        char_size = 2;
1347
180M
    }
1348
5.80M
    else {
1349
5.80M
        if (maxchar > MAX_UNICODE) {
1350
0
            PyErr_SetString(PyExc_SystemError,
1351
0
                            "invalid maximum character passed to PyUnicode_New");
1352
0
            return NULL;
1353
0
        }
1354
5.80M
        kind = PyUnicode_4BYTE_KIND;
1355
5.80M
        char_size = 4;
1356
5.80M
    }
1357
1358
    /* Ensure we won't overflow the size. */
1359
503M
    if (size < 0) {
1360
0
        PyErr_SetString(PyExc_SystemError,
1361
0
                        "Negative size passed to PyUnicode_New");
1362
0
        return NULL;
1363
0
    }
1364
503M
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1365
0
        return PyErr_NoMemory();
1366
1367
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1368
     * PyObject_New() so we are able to allocate space for the object and
1369
     * it's data buffer.
1370
     */
1371
503M
    obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
1372
503M
    if (obj == NULL) {
1373
0
        return PyErr_NoMemory();
1374
0
    }
1375
503M
    _PyObject_Init(obj, &PyUnicode_Type);
1376
1377
503M
    unicode = (PyCompactUnicodeObject *)obj;
1378
503M
    if (is_ascii)
1379
291M
        data = ((PyASCIIObject*)obj) + 1;
1380
211M
    else
1381
211M
        data = unicode + 1;
1382
503M
    _PyUnicode_LENGTH(unicode) = size;
1383
503M
    _PyUnicode_HASH(unicode) = -1;
1384
503M
    _PyUnicode_STATE(unicode).interned = 0;
1385
503M
    _PyUnicode_STATE(unicode).kind = kind;
1386
503M
    _PyUnicode_STATE(unicode).compact = 1;
1387
503M
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1388
503M
    _PyUnicode_STATE(unicode).statically_allocated = 0;
1389
503M
    if (is_ascii) {
1390
291M
        ((char*)data)[size] = 0;
1391
291M
    }
1392
211M
    else if (kind == PyUnicode_1BYTE_KIND) {
1393
25.0M
        ((char*)data)[size] = 0;
1394
25.0M
        unicode->utf8 = NULL;
1395
25.0M
        unicode->utf8_length = 0;
1396
25.0M
    }
1397
186M
    else {
1398
186M
        unicode->utf8 = NULL;
1399
186M
        unicode->utf8_length = 0;
1400
186M
        if (kind == PyUnicode_2BYTE_KIND)
1401
180M
            ((Py_UCS2*)data)[size] = 0;
1402
5.80M
        else /* kind == PyUnicode_4BYTE_KIND */
1403
5.80M
            ((Py_UCS4*)data)[size] = 0;
1404
186M
    }
1405
#ifdef Py_DEBUG
1406
    unicode_fill_invalid((PyObject*)unicode, 0);
1407
#endif
1408
503M
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1409
503M
    return obj;
1410
503M
}
1411
1412
static int
1413
unicode_check_modifiable(PyObject *unicode)
1414
739
{
1415
739
    if (!unicode_modifiable(unicode)) {
1416
0
        PyErr_SetString(PyExc_SystemError,
1417
0
                        "Cannot modify a string currently used");
1418
0
        return -1;
1419
0
    }
1420
739
    return 0;
1421
739
}
1422
1423
static int
1424
_copy_characters(PyObject *to, Py_ssize_t to_start,
1425
                 PyObject *from, Py_ssize_t from_start,
1426
                 Py_ssize_t how_many, int check_maxchar)
1427
277M
{
1428
277M
    int from_kind, to_kind;
1429
277M
    const void *from_data;
1430
277M
    void *to_data;
1431
1432
277M
    assert(0 <= how_many);
1433
277M
    assert(0 <= from_start);
1434
277M
    assert(0 <= to_start);
1435
277M
    assert(PyUnicode_Check(from));
1436
277M
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1437
1438
277M
    assert(to == NULL || PyUnicode_Check(to));
1439
1440
277M
    if (how_many == 0) {
1441
266k
        return 0;
1442
266k
    }
1443
1444
277M
    assert(to != NULL);
1445
277M
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1446
1447
277M
    from_kind = PyUnicode_KIND(from);
1448
277M
    from_data = PyUnicode_DATA(from);
1449
277M
    to_kind = PyUnicode_KIND(to);
1450
277M
    to_data = PyUnicode_DATA(to);
1451
1452
#ifdef Py_DEBUG
1453
    if (!check_maxchar
1454
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1455
    {
1456
        Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1457
        Py_UCS4 ch;
1458
        Py_ssize_t i;
1459
        for (i=0; i < how_many; i++) {
1460
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1461
            assert(ch <= to_maxchar);
1462
        }
1463
    }
1464
#endif
1465
1466
277M
    if (from_kind == to_kind) {
1467
179M
        if (check_maxchar
1468
0
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1469
0
        {
1470
            /* Writing Latin-1 characters into an ASCII string requires to
1471
               check that all written characters are pure ASCII */
1472
0
            Py_UCS4 max_char;
1473
0
            max_char = ucs1lib_find_max_char(from_data,
1474
0
                                             (const Py_UCS1*)from_data + how_many);
1475
0
            if (max_char >= 128)
1476
0
                return -1;
1477
0
        }
1478
179M
        memcpy((char*)to_data + to_kind * to_start,
1479
179M
                  (const char*)from_data + from_kind * from_start,
1480
179M
                  to_kind * how_many);
1481
179M
    }
1482
97.7M
    else if (from_kind == PyUnicode_1BYTE_KIND
1483
95.9M
             && to_kind == PyUnicode_2BYTE_KIND)
1484
80.1M
    {
1485
80.1M
        _PyUnicode_CONVERT_BYTES(
1486
80.1M
            Py_UCS1, Py_UCS2,
1487
80.1M
            PyUnicode_1BYTE_DATA(from) + from_start,
1488
80.1M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1489
80.1M
            PyUnicode_2BYTE_DATA(to) + to_start
1490
80.1M
            );
1491
80.1M
    }
1492
17.6M
    else if (from_kind == PyUnicode_1BYTE_KIND
1493
15.8M
             && to_kind == PyUnicode_4BYTE_KIND)
1494
15.8M
    {
1495
15.8M
        _PyUnicode_CONVERT_BYTES(
1496
15.8M
            Py_UCS1, Py_UCS4,
1497
15.8M
            PyUnicode_1BYTE_DATA(from) + from_start,
1498
15.8M
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1499
15.8M
            PyUnicode_4BYTE_DATA(to) + to_start
1500
15.8M
            );
1501
15.8M
    }
1502
1.81M
    else if (from_kind == PyUnicode_2BYTE_KIND
1503
1.78M
             && to_kind == PyUnicode_4BYTE_KIND)
1504
1.77M
    {
1505
1.77M
        _PyUnicode_CONVERT_BYTES(
1506
1.77M
            Py_UCS2, Py_UCS4,
1507
1.77M
            PyUnicode_2BYTE_DATA(from) + from_start,
1508
1.77M
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1509
1.77M
            PyUnicode_4BYTE_DATA(to) + to_start
1510
1.77M
            );
1511
1.77M
    }
1512
32.3k
    else {
1513
32.3k
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1514
1515
32.3k
        if (!check_maxchar) {
1516
32.3k
            if (from_kind == PyUnicode_2BYTE_KIND
1517
2.35k
                && to_kind == PyUnicode_1BYTE_KIND)
1518
2.35k
            {
1519
2.35k
                _PyUnicode_CONVERT_BYTES(
1520
2.35k
                    Py_UCS2, Py_UCS1,
1521
2.35k
                    PyUnicode_2BYTE_DATA(from) + from_start,
1522
2.35k
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1523
2.35k
                    PyUnicode_1BYTE_DATA(to) + to_start
1524
2.35k
                    );
1525
2.35k
            }
1526
29.9k
            else if (from_kind == PyUnicode_4BYTE_KIND
1527
29.9k
                     && to_kind == PyUnicode_1BYTE_KIND)
1528
9.00k
            {
1529
9.00k
                _PyUnicode_CONVERT_BYTES(
1530
9.00k
                    Py_UCS4, Py_UCS1,
1531
9.00k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1532
9.00k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1533
9.00k
                    PyUnicode_1BYTE_DATA(to) + to_start
1534
9.00k
                    );
1535
9.00k
            }
1536
20.9k
            else if (from_kind == PyUnicode_4BYTE_KIND
1537
20.9k
                     && to_kind == PyUnicode_2BYTE_KIND)
1538
20.9k
            {
1539
20.9k
                _PyUnicode_CONVERT_BYTES(
1540
20.9k
                    Py_UCS4, Py_UCS2,
1541
20.9k
                    PyUnicode_4BYTE_DATA(from) + from_start,
1542
20.9k
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1543
20.9k
                    PyUnicode_2BYTE_DATA(to) + to_start
1544
20.9k
                    );
1545
20.9k
            }
1546
0
            else {
1547
0
                Py_UNREACHABLE();
1548
0
            }
1549
32.3k
        }
1550
0
        else {
1551
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1552
0
            Py_UCS4 ch;
1553
0
            Py_ssize_t i;
1554
1555
0
            for (i=0; i < how_many; i++) {
1556
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1557
0
                if (ch > to_maxchar)
1558
0
                    return -1;
1559
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1560
0
            }
1561
0
        }
1562
32.3k
    }
1563
277M
    return 0;
1564
277M
}
1565
1566
void
1567
_PyUnicode_FastCopyCharacters(
1568
    PyObject *to, Py_ssize_t to_start,
1569
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1570
277M
{
1571
277M
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1572
277M
}
1573
1574
Py_ssize_t
1575
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1576
                         PyObject *from, Py_ssize_t from_start,
1577
                         Py_ssize_t how_many)
1578
0
{
1579
0
    int err;
1580
1581
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1582
0
        PyErr_BadInternalCall();
1583
0
        return -1;
1584
0
    }
1585
1586
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1587
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1588
0
        return -1;
1589
0
    }
1590
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1591
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1592
0
        return -1;
1593
0
    }
1594
0
    if (how_many < 0) {
1595
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1596
0
        return -1;
1597
0
    }
1598
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1599
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1600
0
        PyErr_Format(PyExc_SystemError,
1601
0
                     "Cannot write %zi characters at %zi "
1602
0
                     "in a string of %zi characters",
1603
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1604
0
        return -1;
1605
0
    }
1606
1607
0
    if (how_many == 0)
1608
0
        return 0;
1609
1610
0
    if (unicode_check_modifiable(to))
1611
0
        return -1;
1612
1613
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1614
0
    if (err) {
1615
0
        PyErr_Format(PyExc_SystemError,
1616
0
                     "Cannot copy %s characters "
1617
0
                     "into a string of %s characters",
1618
0
                     unicode_kind_name(from),
1619
0
                     unicode_kind_name(to));
1620
0
        return -1;
1621
0
    }
1622
0
    return how_many;
1623
0
}
1624
1625
/* Find the maximum code point and count the number of surrogate pairs so a
1626
   correct string length can be computed before converting a string to UCS4.
1627
   This function counts single surrogates as a character and not as a pair.
1628
1629
   Return 0 on success, or -1 on error. */
1630
static int
1631
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1632
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1633
17.6k
{
1634
17.6k
    const wchar_t *iter;
1635
17.6k
    Py_UCS4 ch;
1636
1637
17.6k
    assert(num_surrogates != NULL && maxchar != NULL);
1638
17.6k
    *num_surrogates = 0;
1639
17.6k
    *maxchar = 0;
1640
1641
394k
    for (iter = begin; iter < end; ) {
1642
#if SIZEOF_WCHAR_T == 2
1643
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1644
            && (iter+1) < end
1645
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1646
        {
1647
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1648
            ++(*num_surrogates);
1649
            iter += 2;
1650
        }
1651
        else
1652
#endif
1653
376k
        {
1654
376k
            ch = *iter;
1655
376k
            iter++;
1656
376k
        }
1657
376k
        if (ch > *maxchar) {
1658
77.0k
            *maxchar = ch;
1659
77.0k
            if (*maxchar > MAX_UNICODE) {
1660
0
                PyErr_Format(PyExc_ValueError,
1661
0
                             "character U+%x is not in range [U+0000; U+%x]",
1662
0
                             ch, MAX_UNICODE);
1663
0
                return -1;
1664
0
            }
1665
77.0k
        }
1666
376k
    }
1667
17.6k
    return 0;
1668
17.6k
}
1669
1670
static void
1671
unicode_dealloc(PyObject *unicode)
1672
513M
{
1673
#ifdef Py_DEBUG
1674
    if (!unicode_is_finalizing() && unicode_is_singleton(unicode)) {
1675
        _Py_FatalRefcountError("deallocating an Unicode singleton");
1676
    }
1677
#endif
1678
513M
    if (_PyUnicode_STATE(unicode).statically_allocated) {
1679
        /* This should never get called, but we also don't want to SEGV if
1680
        * we accidentally decref an immortal string out of existence. Since
1681
        * the string is an immortal object, just re-set the reference count.
1682
        */
1683
#ifdef Py_DEBUG
1684
        Py_UNREACHABLE();
1685
#endif
1686
0
        _Py_SetImmortal(unicode);
1687
0
        return;
1688
0
    }
1689
513M
    switch (_PyUnicode_STATE(unicode).interned) {
1690
513M
        case SSTATE_NOT_INTERNED:
1691
513M
            break;
1692
650k
        case SSTATE_INTERNED_MORTAL:
1693
            /* Remove the object from the intern dict.
1694
             * Before doing so, we set the refcount to 2: the key and value
1695
             * in the interned_dict.
1696
             */
1697
650k
            assert(Py_REFCNT(unicode) == 0);
1698
650k
            Py_SET_REFCNT(unicode, 2);
1699
#ifdef Py_REF_DEBUG
1700
            /* let's be pedantic with the ref total */
1701
            _Py_IncRefTotal(_PyThreadState_GET());
1702
            _Py_IncRefTotal(_PyThreadState_GET());
1703
#endif
1704
650k
            PyInterpreterState *interp = _PyInterpreterState_GET();
1705
650k
            PyObject *interned = get_interned_dict(interp);
1706
650k
            assert(interned != NULL);
1707
650k
            PyObject *popped;
1708
650k
            int r = PyDict_Pop(interned, unicode, &popped);
1709
650k
            if (r == -1) {
1710
0
                PyErr_FormatUnraisable("Exception ignored while "
1711
0
                                       "removing an interned string %R",
1712
0
                                       unicode);
1713
                // We don't know what happened to the string. It's probably
1714
                // best to leak it:
1715
                // - if it was popped, there are no more references to it
1716
                //   so it can't cause trouble (except wasted memory)
1717
                // - if it wasn't popped, it'll remain interned
1718
0
                _Py_SetImmortal(unicode);
1719
0
                _PyUnicode_STATE(unicode).interned = SSTATE_INTERNED_IMMORTAL;
1720
0
                return;
1721
0
            }
1722
650k
            if (r == 0) {
1723
                // The interned string was not found in the interned_dict.
1724
#ifdef Py_DEBUG
1725
                Py_UNREACHABLE();
1726
#endif
1727
0
                _Py_SetImmortal(unicode);
1728
0
                return;
1729
0
            }
1730
            // Successfully popped.
1731
650k
            assert(popped == unicode);
1732
            // Only our `popped` reference should be left; remove it too.
1733
650k
            assert(Py_REFCNT(unicode) == 1);
1734
650k
            Py_SET_REFCNT(unicode, 0);
1735
#ifdef Py_REF_DEBUG
1736
            /* let's be pedantic with the ref total */
1737
            _Py_DecRefTotal(_PyThreadState_GET());
1738
#endif
1739
650k
            break;
1740
0
        default:
1741
            // As with `statically_allocated` above.
1742
#ifdef Py_REF_DEBUG
1743
            Py_UNREACHABLE();
1744
#endif
1745
0
            _Py_SetImmortal(unicode);
1746
0
            return;
1747
513M
    }
1748
513M
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1749
142k
        PyMem_Free(_PyUnicode_UTF8(unicode));
1750
142k
    }
1751
513M
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
1752
10.9M
        PyMem_Free(_PyUnicode_DATA_ANY(unicode));
1753
10.9M
    }
1754
1755
513M
    Py_TYPE(unicode)->tp_free(unicode);
1756
513M
}
1757
1758
#ifdef Py_DEBUG
1759
static int
1760
unicode_is_singleton(PyObject *unicode)
1761
{
1762
    if (unicode == &_Py_STR(empty)) {
1763
        return 1;
1764
    }
1765
1766
    PyASCIIObject *ascii = _PyASCIIObject_CAST(unicode);
1767
    if (ascii->length == 1) {
1768
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1769
        if (ch < 256 && LATIN1(ch) == unicode) {
1770
            return 1;
1771
        }
1772
    }
1773
    return 0;
1774
}
1775
#endif
1776
1777
static int
1778
unicode_modifiable(PyObject *unicode)
1779
60.6M
{
1780
60.6M
    assert(_PyUnicode_CHECK(unicode));
1781
60.6M
    if (!_PyObject_IsUniquelyReferenced(unicode))
1782
44.6k
        return 0;
1783
60.5M
    if (PyUnicode_HASH(unicode) != -1)
1784
0
        return 0;
1785
60.5M
    if (PyUnicode_CHECK_INTERNED(unicode))
1786
0
        return 0;
1787
60.5M
    if (!PyUnicode_CheckExact(unicode))
1788
0
        return 0;
1789
#ifdef Py_DEBUG
1790
    /* singleton refcount is greater than 1 */
1791
    assert(!unicode_is_singleton(unicode));
1792
#endif
1793
60.5M
    return 1;
1794
60.5M
}
1795
1796
static int
1797
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1798
992k
{
1799
992k
    PyObject *unicode;
1800
992k
    Py_ssize_t old_length;
1801
1802
992k
    assert(p_unicode != NULL);
1803
992k
    unicode = *p_unicode;
1804
1805
992k
    assert(unicode != NULL);
1806
992k
    assert(PyUnicode_Check(unicode));
1807
992k
    assert(0 <= length);
1808
1809
992k
    old_length = PyUnicode_GET_LENGTH(unicode);
1810
992k
    if (old_length == length)
1811
0
        return 0;
1812
1813
992k
    if (length == 0) {
1814
0
        PyObject *empty = unicode_get_empty();
1815
0
        Py_SETREF(*p_unicode, empty);
1816
0
        return 0;
1817
0
    }
1818
1819
992k
    if (!unicode_modifiable(unicode)) {
1820
0
        PyObject *copy = resize_copy(unicode, length);
1821
0
        if (copy == NULL)
1822
0
            return -1;
1823
0
        Py_SETREF(*p_unicode, copy);
1824
0
        return 0;
1825
0
    }
1826
1827
992k
    if (PyUnicode_IS_COMPACT(unicode)) {
1828
992k
        PyObject *new_unicode = resize_compact(unicode, length);
1829
992k
        if (new_unicode == NULL)
1830
0
            return -1;
1831
992k
        *p_unicode = new_unicode;
1832
992k
        return 0;
1833
992k
    }
1834
0
    return resize_inplace(unicode, length);
1835
992k
}
1836
1837
int
1838
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1839
0
{
1840
0
    PyObject *unicode;
1841
0
    if (p_unicode == NULL) {
1842
0
        PyErr_BadInternalCall();
1843
0
        return -1;
1844
0
    }
1845
0
    unicode = *p_unicode;
1846
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1847
0
    {
1848
0
        PyErr_BadInternalCall();
1849
0
        return -1;
1850
0
    }
1851
0
    return unicode_resize(p_unicode, length);
1852
0
}
1853
1854
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1855
1856
   WARNING: The function doesn't copy the terminating null character and
1857
   doesn't check the maximum character (may write a latin1 character in an
1858
   ASCII string). */
1859
static void
1860
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1861
                   const char *str, Py_ssize_t len)
1862
0
{
1863
0
    int kind = PyUnicode_KIND(unicode);
1864
0
    const void *data = PyUnicode_DATA(unicode);
1865
0
    const char *end = str + len;
1866
1867
0
    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1868
0
    switch (kind) {
1869
0
    case PyUnicode_1BYTE_KIND: {
1870
#ifdef Py_DEBUG
1871
        if (PyUnicode_IS_ASCII(unicode)) {
1872
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1873
                (const Py_UCS1*)str,
1874
                (const Py_UCS1*)str + len);
1875
            assert(maxchar < 128);
1876
        }
1877
#endif
1878
0
        memcpy((char *) data + index, str, len);
1879
0
        break;
1880
0
    }
1881
0
    case PyUnicode_2BYTE_KIND: {
1882
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
1883
0
        Py_UCS2 *ucs2 = start;
1884
1885
0
        for (; str < end; ++ucs2, ++str)
1886
0
            *ucs2 = (Py_UCS2)*str;
1887
1888
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1889
0
        break;
1890
0
    }
1891
0
    case PyUnicode_4BYTE_KIND: {
1892
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
1893
0
        Py_UCS4 *ucs4 = start;
1894
1895
0
        for (; str < end; ++ucs4, ++str)
1896
0
            *ucs4 = (Py_UCS4)*str;
1897
1898
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1899
0
        break;
1900
0
    }
1901
0
    default:
1902
0
        Py_UNREACHABLE();
1903
0
    }
1904
0
}
1905
1906
static PyObject*
1907
get_latin1_char(Py_UCS1 ch)
1908
257M
{
1909
257M
    PyObject *o = LATIN1(ch);
1910
257M
    return o;
1911
257M
}
1912
1913
static PyObject*
1914
unicode_char(Py_UCS4 ch)
1915
326M
{
1916
326M
    PyObject *unicode;
1917
1918
326M
    assert(ch <= MAX_UNICODE);
1919
1920
326M
    if (ch < 256) {
1921
201M
        return get_latin1_char(ch);
1922
201M
    }
1923
1924
124M
    unicode = PyUnicode_New(1, ch);
1925
124M
    if (unicode == NULL)
1926
0
        return NULL;
1927
1928
124M
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
1929
124M
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
1930
121M
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1931
121M
    } else {
1932
3.25M
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1933
3.25M
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1934
3.25M
    }
1935
124M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1936
124M
    return unicode;
1937
124M
}
1938
1939
1940
static inline void
1941
unicode_write_widechar(int kind, void *data,
1942
                       const wchar_t *u, Py_ssize_t size,
1943
                       Py_ssize_t num_surrogates)
1944
17.6k
{
1945
17.6k
    switch (kind) {
1946
17.6k
    case PyUnicode_1BYTE_KIND:
1947
17.6k
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char, u, u + size, data);
1948
17.6k
        break;
1949
1950
0
    case PyUnicode_2BYTE_KIND:
1951
#if SIZEOF_WCHAR_T == 2
1952
        memcpy(data, u, size * 2);
1953
#else
1954
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2, u, u + size, data);
1955
0
#endif
1956
0
        break;
1957
1958
0
    case PyUnicode_4BYTE_KIND:
1959
0
    {
1960
#if SIZEOF_WCHAR_T == 2
1961
        // Convert a 16-bits wchar_t representation to UCS4, this will decode
1962
        // surrogate pairs.
1963
        const wchar_t *end = u + size;
1964
        Py_UCS4 *ucs4_out = (Py_UCS4*)data;
1965
#  ifndef NDEBUG
1966
        Py_UCS4 *ucs4_end = (Py_UCS4*)data + (size - num_surrogates);
1967
#  endif
1968
        for (const wchar_t *iter = u; iter < end; ) {
1969
            assert(ucs4_out < ucs4_end);
1970
            if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1971
                && (iter+1) < end
1972
                && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1973
            {
1974
                *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1975
                iter += 2;
1976
            }
1977
            else {
1978
                *ucs4_out++ = *iter;
1979
                iter++;
1980
            }
1981
        }
1982
        assert(ucs4_out == ucs4_end);
1983
#else
1984
0
        assert(num_surrogates == 0);
1985
0
        memcpy(data, u, size * 4);
1986
0
#endif
1987
0
        break;
1988
0
    }
1989
0
    default:
1990
0
        Py_UNREACHABLE();
1991
17.6k
    }
1992
17.6k
}
1993
1994
1995
PyObject *
1996
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
1997
17.7k
{
1998
17.7k
    PyObject *unicode;
1999
17.7k
    Py_UCS4 maxchar = 0;
2000
17.7k
    Py_ssize_t num_surrogates;
2001
2002
17.7k
    if (u == NULL && size != 0) {
2003
0
        PyErr_BadInternalCall();
2004
0
        return NULL;
2005
0
    }
2006
2007
17.7k
    if (size == -1) {
2008
576
        size = wcslen(u);
2009
576
    }
2010
2011
    /* If the Unicode data is known at construction time, we can apply
2012
       some optimizations which share commonly used objects. */
2013
2014
    /* Optimization for empty strings */
2015
17.7k
    if (size == 0)
2016
32
        _Py_RETURN_UNICODE_EMPTY();
2017
2018
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2019
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2020
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2021
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2022
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2023
        if (!converted) {
2024
            return NULL;
2025
        }
2026
        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2027
        PyMem_Free(converted);
2028
        return unicode;
2029
    }
2030
#endif
2031
2032
    /* Single character Unicode objects in the Latin-1 range are
2033
       shared when using this constructor */
2034
17.6k
    if (size == 1 && (Py_UCS4)*u < 256)
2035
0
        return get_latin1_char((unsigned char)*u);
2036
2037
    /* If not empty and not single character, copy the Unicode data
2038
       into the new object */
2039
17.6k
    if (find_maxchar_surrogates(u, u + size,
2040
17.6k
                                &maxchar, &num_surrogates) == -1)
2041
0
        return NULL;
2042
2043
17.6k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2044
17.6k
    if (!unicode)
2045
0
        return NULL;
2046
2047
17.6k
    unicode_write_widechar(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
2048
17.6k
                           u, size, num_surrogates);
2049
2050
17.6k
    return unicode_result(unicode);
2051
17.6k
}
2052
2053
2054
int
2055
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *pub_writer,
2056
                              const wchar_t *str,
2057
                              Py_ssize_t size)
2058
0
{
2059
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
2060
2061
0
    if (size < 0) {
2062
0
        size = wcslen(str);
2063
0
    }
2064
2065
0
    if (size == 0) {
2066
0
        return 0;
2067
0
    }
2068
2069
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2070
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
2071
       non-Unicode locales and hence needs conversion to UCS-4 first. */
2072
    if (_Py_LocaleUsesNonUnicodeWchar()) {
2073
        wchar_t* converted = _Py_DecodeNonUnicodeWchar(str, size);
2074
        if (!converted) {
2075
            return -1;
2076
        }
2077
2078
        int res = PyUnicodeWriter_WriteUCS4(pub_writer, converted, size);
2079
        PyMem_Free(converted);
2080
        return res;
2081
    }
2082
#endif
2083
2084
0
    Py_UCS4 maxchar = 0;
2085
0
    Py_ssize_t num_surrogates;
2086
0
    if (find_maxchar_surrogates(str, str + size,
2087
0
                                &maxchar, &num_surrogates) == -1) {
2088
0
        return -1;
2089
0
    }
2090
2091
0
    if (_PyUnicodeWriter_Prepare(writer, size - num_surrogates, maxchar) < 0) {
2092
0
        return -1;
2093
0
    }
2094
2095
0
    int kind = writer->kind;
2096
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2097
0
    unicode_write_widechar(kind, data, str, size, num_surrogates);
2098
2099
0
    writer->pos += size - num_surrogates;
2100
0
    return 0;
2101
0
}
2102
2103
2104
PyObject *
2105
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2106
571k
{
2107
571k
    if (size < 0) {
2108
0
        PyErr_SetString(PyExc_SystemError,
2109
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2110
0
        return NULL;
2111
0
    }
2112
571k
    if (u != NULL) {
2113
571k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2114
571k
    }
2115
0
    if (size > 0) {
2116
0
        PyErr_SetString(PyExc_SystemError,
2117
0
            "NULL string with positive size with NULL passed to PyUnicode_FromStringAndSize");
2118
0
        return NULL;
2119
0
    }
2120
0
    return unicode_get_empty();
2121
0
}
2122
2123
PyObject *
2124
PyUnicode_FromString(const char *u)
2125
7.34M
{
2126
7.34M
    size_t size = strlen(u);
2127
7.34M
    if (size > PY_SSIZE_T_MAX) {
2128
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2129
0
        return NULL;
2130
0
    }
2131
7.34M
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2132
7.34M
}
2133
2134
2135
PyObject *
2136
_PyUnicode_FromId(_Py_Identifier *id)
2137
0
{
2138
0
    PyMutex_Lock((PyMutex *)&id->mutex);
2139
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
2140
0
    struct _Py_unicode_ids *ids = &interp->unicode.ids;
2141
2142
0
    Py_ssize_t index = _Py_atomic_load_ssize(&id->index);
2143
0
    if (index < 0) {
2144
0
        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_state.ids;
2145
2146
0
        PyMutex_Lock(&rt_ids->mutex);
2147
        // Check again to detect concurrent access. Another thread can have
2148
        // initialized the index while this thread waited for the lock.
2149
0
        index = _Py_atomic_load_ssize(&id->index);
2150
0
        if (index < 0) {
2151
0
            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2152
0
            index = rt_ids->next_index;
2153
0
            rt_ids->next_index++;
2154
0
            _Py_atomic_store_ssize(&id->index, index);
2155
0
        }
2156
0
        PyMutex_Unlock(&rt_ids->mutex);
2157
0
    }
2158
0
    assert(index >= 0);
2159
2160
0
    PyObject *obj;
2161
0
    if (index < ids->size) {
2162
0
        obj = ids->array[index];
2163
0
        if (obj) {
2164
            // Return a borrowed reference
2165
0
            goto end;
2166
0
        }
2167
0
    }
2168
2169
0
    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
2170
0
                                       NULL, NULL);
2171
0
    if (!obj) {
2172
0
        goto end;
2173
0
    }
2174
0
    _PyUnicode_InternImmortal(interp, &obj);
2175
2176
0
    if (index >= ids->size) {
2177
        // Overallocate to reduce the number of realloc
2178
0
        Py_ssize_t new_size = Py_MAX(index * 2, 16);
2179
0
        Py_ssize_t item_size = sizeof(ids->array[0]);
2180
0
        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2181
0
        if (new_array == NULL) {
2182
0
            PyErr_NoMemory();
2183
0
            obj = NULL;
2184
0
            goto end;
2185
0
        }
2186
0
        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2187
0
        ids->array = new_array;
2188
0
        ids->size = new_size;
2189
0
    }
2190
2191
    // The array stores a strong reference
2192
0
    ids->array[index] = obj;
2193
2194
0
end:
2195
0
    PyMutex_Unlock((PyMutex *)&id->mutex);
2196
    // Return a borrowed reference
2197
0
    return obj;
2198
0
}
2199
2200
2201
static void
2202
unicode_clear_identifiers(struct _Py_unicode_state *state)
2203
0
{
2204
0
    struct _Py_unicode_ids *ids = &state->ids;
2205
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
2206
0
        Py_XDECREF(ids->array[i]);
2207
0
    }
2208
0
    ids->size = 0;
2209
0
    PyMem_Free(ids->array);
2210
0
    ids->array = NULL;
2211
    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2212
    // after Py_Finalize().
2213
0
}
2214
2215
2216
/* Internal function, doesn't check maximum character */
2217
2218
PyObject*
2219
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2220
90.8M
{
2221
90.8M
    const unsigned char *s = (const unsigned char *)buffer;
2222
90.8M
    PyObject *unicode;
2223
90.8M
    if (size == 1) {
2224
#ifdef Py_DEBUG
2225
        assert((unsigned char)s[0] < 128);
2226
#endif
2227
33.3M
        return get_latin1_char(s[0]);
2228
33.3M
    }
2229
57.5M
    unicode = PyUnicode_New(size, 127);
2230
57.5M
    if (!unicode)
2231
0
        return NULL;
2232
57.5M
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2233
57.5M
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2234
57.5M
    return unicode;
2235
57.5M
}
2236
2237
static Py_UCS4
2238
kind_maxchar_limit(int kind)
2239
0
{
2240
0
    switch (kind) {
2241
0
    case PyUnicode_1BYTE_KIND:
2242
0
        return 0x80;
2243
0
    case PyUnicode_2BYTE_KIND:
2244
0
        return 0x100;
2245
0
    case PyUnicode_4BYTE_KIND:
2246
0
        return 0x10000;
2247
0
    default:
2248
0
        Py_UNREACHABLE();
2249
0
    }
2250
0
}
2251
2252
static PyObject*
2253
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2254
73.2M
{
2255
73.2M
    PyObject *res;
2256
73.2M
    unsigned char max_char;
2257
2258
73.2M
    if (size == 0) {
2259
10.6M
        _Py_RETURN_UNICODE_EMPTY();
2260
10.6M
    }
2261
73.2M
    assert(size > 0);
2262
62.6M
    if (size == 1) {
2263
21.0M
        return get_latin1_char(u[0]);
2264
21.0M
    }
2265
2266
41.6M
    max_char = ucs1lib_find_max_char(u, u + size);
2267
41.6M
    res = PyUnicode_New(size, max_char);
2268
41.6M
    if (!res)
2269
0
        return NULL;
2270
41.6M
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2271
41.6M
    assert(_PyUnicode_CheckConsistency(res, 1));
2272
41.6M
    return res;
2273
41.6M
}
2274
2275
static PyObject*
2276
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2277
104M
{
2278
104M
    PyObject *res;
2279
104M
    Py_UCS2 max_char;
2280
2281
104M
    if (size == 0)
2282
14.1M
        _Py_RETURN_UNICODE_EMPTY();
2283
104M
    assert(size > 0);
2284
90.5M
    if (size == 1)
2285
62.3M
        return unicode_char(u[0]);
2286
2287
28.2M
    max_char = ucs2lib_find_max_char(u, u + size);
2288
28.2M
    res = PyUnicode_New(size, max_char);
2289
28.2M
    if (!res)
2290
0
        return NULL;
2291
28.2M
    if (max_char >= 256)
2292
16.7M
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2293
11.4M
    else {
2294
11.4M
        _PyUnicode_CONVERT_BYTES(
2295
11.4M
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2296
11.4M
    }
2297
28.2M
    assert(_PyUnicode_CheckConsistency(res, 1));
2298
28.2M
    return res;
2299
28.2M
}
2300
2301
static PyObject*
2302
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2303
75.1M
{
2304
75.1M
    PyObject *res;
2305
75.1M
    Py_UCS4 max_char;
2306
2307
75.1M
    if (size == 0)
2308
7.58M
        _Py_RETURN_UNICODE_EMPTY();
2309
75.1M
    assert(size > 0);
2310
67.5M
    if (size == 1)
2311
48.7M
        return unicode_char(u[0]);
2312
2313
18.8M
    max_char = ucs4lib_find_max_char(u, u + size);
2314
18.8M
    res = PyUnicode_New(size, max_char);
2315
18.8M
    if (!res)
2316
0
        return NULL;
2317
18.8M
    if (max_char < 256)
2318
13.7M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2319
18.8M
                                 PyUnicode_1BYTE_DATA(res));
2320
5.10M
    else if (max_char < 0x10000)
2321
3.32M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2322
5.10M
                                 PyUnicode_2BYTE_DATA(res));
2323
1.77M
    else
2324
1.77M
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2325
18.8M
    assert(_PyUnicode_CheckConsistency(res, 1));
2326
18.8M
    return res;
2327
18.8M
}
2328
2329
2330
int
2331
PyUnicodeWriter_WriteUCS4(PyUnicodeWriter *pub_writer,
2332
                          Py_UCS4 *str,
2333
                          Py_ssize_t size)
2334
0
{
2335
0
    _PyUnicodeWriter *writer = (_PyUnicodeWriter*)pub_writer;
2336
2337
0
    if (size < 0) {
2338
0
        PyErr_SetString(PyExc_ValueError,
2339
0
                        "size must be positive");
2340
0
        return -1;
2341
0
    }
2342
2343
0
    if (size == 0) {
2344
0
        return 0;
2345
0
    }
2346
2347
0
    Py_UCS4 max_char = ucs4lib_find_max_char(str, str + size);
2348
2349
0
    if (_PyUnicodeWriter_Prepare(writer, size, max_char) < 0) {
2350
0
        return -1;
2351
0
    }
2352
2353
0
    int kind = writer->kind;
2354
0
    void *data = (Py_UCS1*)writer->data + writer->pos * kind;
2355
0
    if (kind == PyUnicode_1BYTE_KIND) {
2356
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1,
2357
0
                                 str, str + size,
2358
0
                                 data);
2359
0
    }
2360
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2361
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2,
2362
0
                                 str, str + size,
2363
0
                                 data);
2364
0
    }
2365
0
    else {
2366
0
        memcpy(data, str, size * sizeof(Py_UCS4));
2367
0
    }
2368
0
    writer->pos += size;
2369
2370
0
    return 0;
2371
0
}
2372
2373
2374
PyObject*
2375
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376
193M
{
2377
193M
    if (size < 0) {
2378
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2379
0
        return NULL;
2380
0
    }
2381
193M
    switch (kind) {
2382
43.4M
    case PyUnicode_1BYTE_KIND:
2383
43.4M
        return _PyUnicode_FromUCS1(buffer, size);
2384
86.0M
    case PyUnicode_2BYTE_KIND:
2385
86.0M
        return _PyUnicode_FromUCS2(buffer, size);
2386
63.7M
    case PyUnicode_4BYTE_KIND:
2387
63.7M
        return _PyUnicode_FromUCS4(buffer, size);
2388
0
    default:
2389
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2390
0
        return NULL;
2391
193M
    }
2392
193M
}
2393
2394
Py_UCS4
2395
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396
13.5M
{
2397
13.5M
    int kind;
2398
13.5M
    const void *startptr, *endptr;
2399
2400
13.5M
    assert(0 <= start);
2401
13.5M
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2402
13.5M
    assert(start <= end);
2403
2404
13.5M
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2405
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2406
2407
13.5M
    if (start == end)
2408
0
        return 127;
2409
2410
13.5M
    if (PyUnicode_IS_ASCII(unicode))
2411
13.5M
        return 127;
2412
2413
40.3k
    kind = PyUnicode_KIND(unicode);
2414
40.3k
    startptr = PyUnicode_DATA(unicode);
2415
40.3k
    endptr = (char *)startptr + end * kind;
2416
40.3k
    startptr = (char *)startptr + start * kind;
2417
40.3k
    switch(kind) {
2418
1.57k
    case PyUnicode_1BYTE_KIND:
2419
1.57k
        return ucs1lib_find_max_char(startptr, endptr);
2420
3.97k
    case PyUnicode_2BYTE_KIND:
2421
3.97k
        return ucs2lib_find_max_char(startptr, endptr);
2422
34.7k
    case PyUnicode_4BYTE_KIND:
2423
34.7k
        return ucs4lib_find_max_char(startptr, endptr);
2424
0
    default:
2425
0
        Py_UNREACHABLE();
2426
40.3k
    }
2427
40.3k
}
2428
2429
/* Ensure that a string uses the most efficient storage, if it is not the
2430
   case: create a new string with of the right kind. Write NULL into *p_unicode
2431
   on error. */
2432
static void
2433
unicode_adjust_maxchar(PyObject **p_unicode)
2434
0
{
2435
0
    PyObject *unicode, *copy;
2436
0
    Py_UCS4 max_char;
2437
0
    Py_ssize_t len;
2438
0
    int kind;
2439
2440
0
    assert(p_unicode != NULL);
2441
0
    unicode = *p_unicode;
2442
0
    if (PyUnicode_IS_ASCII(unicode))
2443
0
        return;
2444
2445
0
    len = PyUnicode_GET_LENGTH(unicode);
2446
0
    kind = PyUnicode_KIND(unicode);
2447
0
    if (kind == PyUnicode_1BYTE_KIND) {
2448
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2449
0
        max_char = ucs1lib_find_max_char(u, u + len);
2450
0
        if (max_char >= 128)
2451
0
            return;
2452
0
    }
2453
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2454
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2455
0
        max_char = ucs2lib_find_max_char(u, u + len);
2456
0
        if (max_char >= 256)
2457
0
            return;
2458
0
    }
2459
0
    else if (kind == PyUnicode_4BYTE_KIND) {
2460
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2461
0
        max_char = ucs4lib_find_max_char(u, u + len);
2462
0
        if (max_char >= 0x10000)
2463
0
            return;
2464
0
    }
2465
0
    else
2466
0
        Py_UNREACHABLE();
2467
2468
0
    copy = PyUnicode_New(len, max_char);
2469
0
    if (copy != NULL)
2470
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2471
0
    Py_DECREF(unicode);
2472
0
    *p_unicode = copy;
2473
0
}
2474
2475
/* Return a new string with the same length, maximum character and data as
   `unicode`.  A non-str argument is reported through
   PyErr_BadInternalCall().  Return NULL on error. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    PyObject *dup = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (dup == NULL) {
        return NULL;
    }
    assert(PyUnicode_KIND(dup) == PyUnicode_KIND(unicode));

    /* Same kind on both sides, so the raw bytes can be copied verbatim. */
    Py_ssize_t nbytes = nchars * PyUnicode_KIND(unicode);
    memcpy(PyUnicode_DATA(dup), PyUnicode_DATA(unicode), nbytes);
    assert(_PyUnicode_CheckConsistency(dup, 1));
    return dup;
}
2497
2498
2499
/* Widen Unicode objects to larger buffers. Don't write terminating null
2500
   character. Return NULL on error. */
2501
2502
static void*
2503
unicode_askind(int skind, void const *data, Py_ssize_t len, int kind)
2504
9.85M
{
2505
9.85M
    void *result;
2506
2507
9.85M
    assert(skind < kind);
2508
9.85M
    switch (kind) {
2509
8.77M
    case PyUnicode_2BYTE_KIND:
2510
8.77M
        result = PyMem_New(Py_UCS2, len);
2511
8.77M
        if (!result)
2512
0
            return PyErr_NoMemory();
2513
8.77M
        assert(skind == PyUnicode_1BYTE_KIND);
2514
8.77M
        _PyUnicode_CONVERT_BYTES(
2515
8.77M
            Py_UCS1, Py_UCS2,
2516
8.77M
            (const Py_UCS1 *)data,
2517
8.77M
            ((const Py_UCS1 *)data) + len,
2518
8.77M
            result);
2519
8.77M
        return result;
2520
1.08M
    case PyUnicode_4BYTE_KIND:
2521
1.08M
        result = PyMem_New(Py_UCS4, len);
2522
1.08M
        if (!result)
2523
0
            return PyErr_NoMemory();
2524
1.08M
        if (skind == PyUnicode_2BYTE_KIND) {
2525
0
            _PyUnicode_CONVERT_BYTES(
2526
0
                Py_UCS2, Py_UCS4,
2527
0
                (const Py_UCS2 *)data,
2528
0
                ((const Py_UCS2 *)data) + len,
2529
0
                result);
2530
0
        }
2531
1.08M
        else {
2532
1.08M
            assert(skind == PyUnicode_1BYTE_KIND);
2533
1.08M
            _PyUnicode_CONVERT_BYTES(
2534
1.08M
                Py_UCS1, Py_UCS4,
2535
1.08M
                (const Py_UCS1 *)data,
2536
1.08M
                ((const Py_UCS1 *)data) + len,
2537
1.08M
                result);
2538
1.08M
        }
2539
1.08M
        return result;
2540
0
    default:
2541
0
        Py_UNREACHABLE();
2542
0
        return NULL;
2543
9.85M
    }
2544
9.85M
}
2545
2546
static Py_UCS4*
2547
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2548
        int copy_null)
2549
75.5k
{
2550
75.5k
    int kind;
2551
75.5k
    const void *data;
2552
75.5k
    Py_ssize_t len, targetlen;
2553
75.5k
    kind = PyUnicode_KIND(string);
2554
75.5k
    data = PyUnicode_DATA(string);
2555
75.5k
    len = PyUnicode_GET_LENGTH(string);
2556
75.5k
    targetlen = len;
2557
75.5k
    if (copy_null)
2558
0
        targetlen++;
2559
75.5k
    if (!target) {
2560
0
        target = PyMem_New(Py_UCS4, targetlen);
2561
0
        if (!target) {
2562
0
            PyErr_NoMemory();
2563
0
            return NULL;
2564
0
        }
2565
0
    }
2566
75.5k
    else {
2567
75.5k
        if (targetsize < targetlen) {
2568
0
            PyErr_Format(PyExc_SystemError,
2569
0
                         "string is longer than the buffer");
2570
0
            if (copy_null && 0 < targetsize)
2571
0
                target[0] = 0;
2572
0
            return NULL;
2573
0
        }
2574
75.5k
    }
2575
75.5k
    if (kind == PyUnicode_1BYTE_KIND) {
2576
56.1k
        const Py_UCS1 *start = (const Py_UCS1 *) data;
2577
56.1k
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2578
56.1k
    }
2579
19.3k
    else if (kind == PyUnicode_2BYTE_KIND) {
2580
14.8k
        const Py_UCS2 *start = (const Py_UCS2 *) data;
2581
14.8k
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2582
14.8k
    }
2583
4.55k
    else if (kind == PyUnicode_4BYTE_KIND) {
2584
4.55k
        memcpy(target, data, len * sizeof(Py_UCS4));
2585
4.55k
    }
2586
0
    else {
2587
0
        Py_UNREACHABLE();
2588
0
    }
2589
75.5k
    if (copy_null)
2590
0
        target[len] = 0;
2591
75.5k
    return target;
2592
75.5k
}
2593
2594
Py_UCS4*
2595
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2596
                 int copy_null)
2597
75.5k
{
2598
75.5k
    if (target == NULL || targetsize < 0) {
2599
0
        PyErr_BadInternalCall();
2600
0
        return NULL;
2601
0
    }
2602
75.5k
    return as_ucs4(string, target, targetsize, copy_null);
2603
75.5k
}
2604
2605
Py_UCS4*
2606
PyUnicode_AsUCS4Copy(PyObject *string)
2607
0
{
2608
0
    return as_ucs4(string, NULL, 0, 1);
2609
0
}
2610
2611
/* Maximum number of characters required for output of %jo or %jd or %p.
   We need at most ceil(log8(256)*sizeof(intmax_t)) digits,
   plus 1 for the sign, plus 2 for the 0x prefix (for %p),
   plus 1 for the terminal NUL.

   With bits = sizeof(intmax_t)*8, ceil(bits/3) equals 1 + (bits-1)/3 in
   integer arithmetic.  The constant 5 below is therefore that 1, plus the
   sign (1), the "0x" prefix (2) and the NUL (1). */
2615
#define MAX_INTMAX_CHARS (5 + (sizeof(intmax_t)*8-1) / 3)
2616
2617
static int
2618
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2619
                             Py_ssize_t width, Py_ssize_t precision, int flags)
2620
32.1k
{
2621
32.1k
    Py_ssize_t length, fill, arglen;
2622
32.1k
    Py_UCS4 maxchar;
2623
2624
32.1k
    length = PyUnicode_GET_LENGTH(str);
2625
32.1k
    if ((precision == -1 || precision >= length)
2626
32.1k
        && width <= length)
2627
32.1k
        return _PyUnicodeWriter_WriteStr(writer, str);
2628
2629
49
    if (precision != -1)
2630
49
        length = Py_MIN(precision, length);
2631
2632
49
    arglen = Py_MAX(length, width);
2633
49
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2634
25
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2635
24
    else
2636
24
        maxchar = writer->maxchar;
2637
2638
49
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2639
0
        return -1;
2640
2641
49
    fill = Py_MAX(width - length, 0);
2642
49
    if (fill && !(flags & F_LJUST)) {
2643
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2644
0
            return -1;
2645
0
        writer->pos += fill;
2646
0
    }
2647
2648
49
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2649
49
                                  str, 0, length);
2650
49
    writer->pos += length;
2651
2652
49
    if (fill && (flags & F_LJUST)) {
2653
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2654
0
            return -1;
2655
0
        writer->pos += fill;
2656
0
    }
2657
2658
49
    return 0;
2659
49
}
2660
2661
static int
2662
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
2663
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2664
4.36M
{
2665
    /* UTF-8 */
2666
4.36M
    Py_ssize_t *pconsumed = NULL;
2667
4.36M
    Py_ssize_t length;
2668
4.36M
    if (precision == -1) {
2669
202k
        length = strlen(str);
2670
202k
    }
2671
4.16M
    else {
2672
4.16M
        length = 0;
2673
17.2M
        while (length < precision && str[length]) {
2674
13.0M
            length++;
2675
13.0M
        }
2676
4.16M
        if (length == precision) {
2677
            /* The input string is not NUL-terminated.  If it ends with an
2678
             * incomplete UTF-8 sequence, truncate the string just before it.
2679
             * Incomplete sequences in the middle and sequences which cannot
2680
             * be valid prefixes are still treated as errors and replaced
2681
             * with \xfffd. */
2682
1.80k
            pconsumed = &length;
2683
1.80k
        }
2684
4.16M
    }
2685
2686
4.36M
    if (width < 0) {
2687
4.36M
        return unicode_decode_utf8_writer(writer, str, length,
2688
4.36M
                                          _Py_ERROR_REPLACE, "replace", pconsumed);
2689
4.36M
    }
2690
2691
0
    PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
2692
0
                                                     "replace", pconsumed);
2693
0
    if (unicode == NULL)
2694
0
        return -1;
2695
2696
0
    int res = unicode_fromformat_write_str(writer, unicode,
2697
0
                                           width, -1, flags);
2698
0
    Py_DECREF(unicode);
2699
0
    return res;
2700
0
}
2701
2702
static int
2703
unicode_fromformat_write_wcstr(_PyUnicodeWriter *writer, const wchar_t *str,
2704
                              Py_ssize_t width, Py_ssize_t precision, int flags)
2705
0
{
2706
0
    Py_ssize_t length;
2707
0
    if (precision == -1) {
2708
0
        length = wcslen(str);
2709
0
    }
2710
0
    else {
2711
0
        length = 0;
2712
0
        while (length < precision && str[length]) {
2713
0
            length++;
2714
0
        }
2715
0
    }
2716
2717
0
    if (width < 0) {
2718
0
        return PyUnicodeWriter_WriteWideChar((PyUnicodeWriter*)writer,
2719
0
                                             str, length);
2720
0
    }
2721
2722
0
    PyObject *unicode = PyUnicode_FromWideChar(str, length);
2723
0
    if (unicode == NULL)
2724
0
        return -1;
2725
2726
0
    int res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
2727
0
    Py_DECREF(unicode);
2728
0
    return res;
2729
0
}
2730
2731
0
/* Size modifiers of a format specification, as stored in the `sizemod`
   variable of unicode_fromformat_arg() (0 means "no modifier"). */
#define F_LONG 1        /* "l"  */
#define F_LONGLONG 2    /* "ll" */
#define F_SIZE 3        /* "z"  */
#define F_PTRDIFF 4     /* "t"  */
#define F_INTMAX 5      /* "j"  */
2736
2737
static const char*
2738
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2739
                       const char *f, va_list *vargs)
2740
31.7M
{
2741
31.7M
    const char *p;
2742
31.7M
    Py_ssize_t len;
2743
31.7M
    int flags = 0;
2744
31.7M
    Py_ssize_t width;
2745
31.7M
    Py_ssize_t precision;
2746
2747
31.7M
    p = f;
2748
31.7M
    f++;
2749
31.7M
    if (*f == '%') {
2750
4.15M
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2751
0
            return NULL;
2752
4.15M
        f++;
2753
4.15M
        return f;
2754
4.15M
    }
2755
2756
    /* Parse flags. Example: "%-i" => flags=F_LJUST. */
2757
    /* Flags '+', ' ' and '#' are not particularly useful.
2758
     * They are not worth the implementation and maintenance costs.
2759
     * In addition, '#' should add "0" for "o" conversions for compatibility
2760
     * with printf, but it would confuse Python users. */
2761
27.5M
    while (1) {
2762
27.5M
        switch (*f++) {
2763
0
        case '-': flags |= F_LJUST; continue;
2764
2.67k
        case '0': flags |= F_ZERO; continue;
2765
0
        case '#': flags |= F_ALT; continue;
2766
27.5M
        }
2767
27.5M
        f--;
2768
27.5M
        break;
2769
27.5M
    }
2770
2771
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2772
27.5M
    width = -1;
2773
27.5M
    if (*f == '*') {
2774
0
        width = va_arg(*vargs, int);
2775
0
        if (width < 0) {
2776
0
            flags |= F_LJUST;
2777
0
            width = -width;
2778
0
        }
2779
0
        f++;
2780
0
    }
2781
27.5M
    else if (Py_ISDIGIT((unsigned)*f)) {
2782
2.67k
        width = *f - '0';
2783
2.67k
        f++;
2784
2.67k
        while (Py_ISDIGIT((unsigned)*f)) {
2785
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2786
0
                PyErr_SetString(PyExc_ValueError,
2787
0
                                "width too big");
2788
0
                return NULL;
2789
0
            }
2790
0
            width = (width * 10) + (*f - '0');
2791
0
            f++;
2792
0
        }
2793
2.67k
    }
2794
27.5M
    precision = -1;
2795
27.5M
    if (*f == '.') {
2796
4.16M
        f++;
2797
4.16M
        if (*f == '*') {
2798
0
            precision = va_arg(*vargs, int);
2799
0
            if (precision < 0) {
2800
0
                precision = -2;
2801
0
            }
2802
0
            f++;
2803
0
        }
2804
4.16M
        else if (Py_ISDIGIT((unsigned)*f)) {
2805
4.16M
            precision = (*f - '0');
2806
4.16M
            f++;
2807
12.5M
            while (Py_ISDIGIT((unsigned)*f)) {
2808
8.33M
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2809
0
                    PyErr_SetString(PyExc_ValueError,
2810
0
                                    "precision too big");
2811
0
                    return NULL;
2812
0
                }
2813
8.33M
                precision = (precision * 10) + (*f - '0');
2814
8.33M
                f++;
2815
8.33M
            }
2816
4.16M
        }
2817
4.16M
    }
2818
2819
27.5M
    int sizemod = 0;
2820
27.5M
    if (*f == 'l') {
2821
0
        if (f[1] == 'l') {
2822
0
            sizemod = F_LONGLONG;
2823
0
            f += 2;
2824
0
        }
2825
0
        else {
2826
0
            sizemod = F_LONG;
2827
0
            ++f;
2828
0
        }
2829
0
    }
2830
27.5M
    else if (*f == 'z') {
2831
59.0k
        sizemod = F_SIZE;
2832
59.0k
        ++f;
2833
59.0k
    }
2834
27.5M
    else if (*f == 't') {
2835
0
        sizemod = F_PTRDIFF;
2836
0
        ++f;
2837
0
    }
2838
27.5M
    else if (*f == 'j') {
2839
0
        sizemod = F_INTMAX;
2840
0
        ++f;
2841
0
    }
2842
27.5M
    if (f[0] != '\0' && f[1] == '\0')
2843
4.23M
        writer->overallocate = 0;
2844
2845
27.5M
    switch (*f) {
2846
19.0M
    case 'd': case 'i': case 'o': case 'u': case 'x': case 'X':
2847
19.0M
        break;
2848
4.16M
    case 'c': case 'p':
2849
4.16M
        if (sizemod || width >= 0 || precision >= 0) goto invalid_format;
2850
4.16M
        break;
2851
4.36M
    case 's':
2852
4.36M
    case 'V':
2853
4.36M
        if (sizemod && sizemod != F_LONG) goto invalid_format;
2854
4.36M
        break;
2855
4.36M
    default:
2856
32.1k
        if (sizemod) goto invalid_format;
2857
32.1k
        break;
2858
27.5M
    }
2859
2860
27.5M
    switch (*f) {
2861
4.16M
    case 'c':
2862
4.16M
    {
2863
4.16M
        int ordinal = va_arg(*vargs, int);
2864
4.16M
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2865
0
            PyErr_SetString(PyExc_OverflowError,
2866
0
                            "character argument not in range(0x110000)");
2867
0
            return NULL;
2868
0
        }
2869
4.16M
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2870
0
            return NULL;
2871
4.16M
        break;
2872
4.16M
    }
2873
2874
19.0M
    case 'd': case 'i':
2875
19.0M
    case 'o': case 'u': case 'x': case 'X':
2876
19.0M
    {
2877
19.0M
        char buffer[MAX_INTMAX_CHARS];
2878
2879
        // Fill buffer using sprinf, with one of many possible format
2880
        // strings, like "%llX" for `long long` in hexadecimal.
2881
        // The type/size is in `sizemod`; the format is in `*f`.
2882
2883
        // Use macros with nested switches to keep the sprintf format strings
2884
        // as compile-time literals, avoiding warnings and maybe allowing
2885
        // optimizations.
2886
2887
        // `SPRINT` macro does one sprintf
2888
        // Example usage: SPRINT("l", "X", unsigned long) expands to
2889
        // sprintf(buffer, "%" "l" "X", va_arg(*vargs, unsigned long))
2890
19.0M
        #define SPRINT(SIZE_SPEC, FMT_CHAR, TYPE) \
2891
19.0M
            sprintf(buffer, "%" SIZE_SPEC FMT_CHAR, va_arg(*vargs, TYPE))
2892
2893
        // One inner switch to handle all format variants
2894
19.0M
        #define DO_SPRINTS(SIZE_SPEC, SIGNED_TYPE, UNSIGNED_TYPE)             \
2895
19.0M
            switch (*f) {                                                     \
2896
0
                case 'o': len = SPRINT(SIZE_SPEC, "o", UNSIGNED_TYPE); break; \
2897
0
                case 'u': len = SPRINT(SIZE_SPEC, "u", UNSIGNED_TYPE); break; \
2898
1.49k
                case 'x': len = SPRINT(SIZE_SPEC, "x", UNSIGNED_TYPE); break; \
2899
1.17k
                case 'X': len = SPRINT(SIZE_SPEC, "X", UNSIGNED_TYPE); break; \
2900
19.0M
                default:  len = SPRINT(SIZE_SPEC, "d", SIGNED_TYPE); break;   \
2901
19.0M
            }
2902
2903
        // Outer switch to handle all the sizes/types
2904
19.0M
        switch (sizemod) {
2905
0
            case F_LONG:     DO_SPRINTS("l", long, unsigned long); break;
2906
0
            case F_LONGLONG: DO_SPRINTS("ll", long long, unsigned long long); break;
2907
59.0k
            case F_SIZE:     DO_SPRINTS("z", Py_ssize_t, size_t); break;
2908
0
            case F_PTRDIFF:  DO_SPRINTS("t", ptrdiff_t, ptrdiff_t); break;
2909
0
            case F_INTMAX:   DO_SPRINTS("j", intmax_t, uintmax_t); break;
2910
18.9M
            default:         DO_SPRINTS("", int, unsigned int); break;
2911
19.0M
        }
2912
19.0M
        #undef SPRINT
2913
19.0M
        #undef DO_SPRINTS
2914
2915
19.0M
        assert(len >= 0);
2916
2917
19.0M
        int sign = (buffer[0] == '-');
2918
19.0M
        len -= sign;
2919
2920
19.0M
        precision = Py_MAX(precision, len);
2921
19.0M
        width = Py_MAX(width, precision + sign);
2922
19.0M
        if ((flags & F_ZERO) && !(flags & F_LJUST)) {
2923
2.67k
            precision = width - sign;
2924
2.67k
        }
2925
2926
19.0M
        Py_ssize_t spacepad = Py_MAX(width - precision - sign, 0);
2927
19.0M
        Py_ssize_t zeropad = Py_MAX(precision - len, 0);
2928
2929
19.0M
        if (_PyUnicodeWriter_Prepare(writer, width, 127) == -1)
2930
0
            return NULL;
2931
2932
19.0M
        if (spacepad && !(flags & F_LJUST)) {
2933
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2934
0
                return NULL;
2935
0
            writer->pos += spacepad;
2936
0
        }
2937
2938
19.0M
        if (sign) {
2939
0
            if (_PyUnicodeWriter_WriteChar(writer, '-') == -1)
2940
0
                return NULL;
2941
0
        }
2942
2943
19.0M
        if (zeropad) {
2944
739
            if (PyUnicode_Fill(writer->buffer, writer->pos, zeropad, '0') == -1)
2945
0
                return NULL;
2946
739
            writer->pos += zeropad;
2947
739
        }
2948
2949
19.0M
        if (_PyUnicodeWriter_WriteASCIIString(writer, &buffer[sign], len) < 0)
2950
0
            return NULL;
2951
2952
19.0M
        if (spacepad && (flags & F_LJUST)) {
2953
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, spacepad, ' ') == -1)
2954
0
                return NULL;
2955
0
            writer->pos += spacepad;
2956
0
        }
2957
19.0M
        break;
2958
19.0M
    }
2959
2960
19.0M
    case 'p':
2961
0
    {
2962
0
        char number[MAX_INTMAX_CHARS];
2963
2964
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2965
0
        assert(len >= 0);
2966
2967
        /* %p is ill-defined:  ensure leading 0x. */
2968
0
        if (number[1] == 'X')
2969
0
            number[1] = 'x';
2970
0
        else if (number[1] != 'x') {
2971
0
            memmove(number + 2, number,
2972
0
                    strlen(number) + 1);
2973
0
            number[0] = '0';
2974
0
            number[1] = 'x';
2975
0
            len += 2;
2976
0
        }
2977
2978
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2979
0
            return NULL;
2980
0
        break;
2981
0
    }
2982
2983
4.36M
    case 's':
2984
4.36M
    {
2985
4.36M
        if (sizemod) {
2986
0
            const wchar_t *s = va_arg(*vargs, const wchar_t*);
2987
0
            if (unicode_fromformat_write_wcstr(writer, s, width, precision, flags) < 0)
2988
0
                return NULL;
2989
0
        }
2990
4.36M
        else {
2991
            /* UTF-8 */
2992
4.36M
            const char *s = va_arg(*vargs, const char*);
2993
4.36M
            if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
2994
0
                return NULL;
2995
4.36M
        }
2996
4.36M
        break;
2997
4.36M
    }
2998
2999
4.36M
    case 'U':
3000
31.4k
    {
3001
31.4k
        PyObject *obj = va_arg(*vargs, PyObject *);
3002
31.4k
        assert(obj && _PyUnicode_CHECK(obj));
3003
3004
31.4k
        if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3005
0
            return NULL;
3006
31.4k
        break;
3007
31.4k
    }
3008
3009
31.4k
    case 'V':
3010
0
    {
3011
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3012
0
        const char *str;
3013
0
        const wchar_t *wstr;
3014
0
        if (sizemod) {
3015
0
            wstr = va_arg(*vargs, const wchar_t*);
3016
0
        }
3017
0
        else {
3018
0
            str = va_arg(*vargs, const char *);
3019
0
        }
3020
0
        if (obj) {
3021
0
            assert(_PyUnicode_CHECK(obj));
3022
0
            if (unicode_fromformat_write_str(writer, obj, width, precision, flags) == -1)
3023
0
                return NULL;
3024
0
        }
3025
0
        else if (sizemod) {
3026
0
            assert(wstr != NULL);
3027
0
            if (unicode_fromformat_write_wcstr(writer, wstr, width, precision, flags) < 0)
3028
0
                return NULL;
3029
0
        }
3030
0
        else {
3031
0
            assert(str != NULL);
3032
0
            if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
3033
0
                return NULL;
3034
0
        }
3035
0
        break;
3036
0
    }
3037
3038
52
    case 'S':
3039
52
    {
3040
52
        PyObject *obj = va_arg(*vargs, PyObject *);
3041
52
        PyObject *str;
3042
52
        assert(obj);
3043
52
        str = PyObject_Str(obj);
3044
52
        if (!str)
3045
0
            return NULL;
3046
52
        if (unicode_fromformat_write_str(writer, str, width, precision, flags) == -1) {
3047
0
            Py_DECREF(str);
3048
0
            return NULL;
3049
0
        }
3050
52
        Py_DECREF(str);
3051
52
        break;
3052
52
    }
3053
3054
684
    case 'R':
3055
684
    {
3056
684
        PyObject *obj = va_arg(*vargs, PyObject *);
3057
684
        PyObject *repr;
3058
684
        assert(obj);
3059
684
        repr = PyObject_Repr(obj);
3060
684
        if (!repr)
3061
0
            return NULL;
3062
684
        if (unicode_fromformat_write_str(writer, repr, width, precision, flags) == -1) {
3063
0
            Py_DECREF(repr);
3064
0
            return NULL;
3065
0
        }
3066
684
        Py_DECREF(repr);
3067
684
        break;
3068
684
    }
3069
3070
0
    case 'A':
3071
0
    {
3072
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3073
0
        PyObject *ascii;
3074
0
        assert(obj);
3075
0
        ascii = PyObject_ASCII(obj);
3076
0
        if (!ascii)
3077
0
            return NULL;
3078
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision, flags) == -1) {
3079
0
            Py_DECREF(ascii);
3080
0
            return NULL;
3081
0
        }
3082
0
        Py_DECREF(ascii);
3083
0
        break;
3084
0
    }
3085
3086
0
    case 'T':
3087
0
    {
3088
0
        PyObject *obj = va_arg(*vargs, PyObject *);
3089
0
        PyTypeObject *type = (PyTypeObject *)Py_NewRef(Py_TYPE(obj));
3090
3091
0
        PyObject *type_name;
3092
0
        if (flags & F_ALT) {
3093
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3094
0
        }
3095
0
        else {
3096
0
            type_name = PyType_GetFullyQualifiedName(type);
3097
0
        }
3098
0
        Py_DECREF(type);
3099
0
        if (!type_name) {
3100
0
            return NULL;
3101
0
        }
3102
3103
0
        if (unicode_fromformat_write_str(writer, type_name,
3104
0
                                         width, precision, flags) == -1) {
3105
0
            Py_DECREF(type_name);
3106
0
            return NULL;
3107
0
        }
3108
0
        Py_DECREF(type_name);
3109
0
        break;
3110
0
    }
3111
3112
0
    case 'N':
3113
0
    {
3114
0
        PyObject *type_raw = va_arg(*vargs, PyObject *);
3115
0
        assert(type_raw != NULL);
3116
3117
0
        if (!PyType_Check(type_raw)) {
3118
0
            PyErr_SetString(PyExc_TypeError, "%N argument must be a type");
3119
0
            return NULL;
3120
0
        }
3121
0
        PyTypeObject *type = (PyTypeObject*)type_raw;
3122
3123
0
        PyObject *type_name;
3124
0
        if (flags & F_ALT) {
3125
0
            type_name = _PyType_GetFullyQualifiedName(type, ':');
3126
0
        }
3127
0
        else {
3128
0
            type_name = PyType_GetFullyQualifiedName(type);
3129
0
        }
3130
0
        if (!type_name) {
3131
0
            return NULL;
3132
0
        }
3133
0
        if (unicode_fromformat_write_str(writer, type_name,
3134
0
                                         width, precision, flags) == -1) {
3135
0
            Py_DECREF(type_name);
3136
0
            return NULL;
3137
0
        }
3138
0
        Py_DECREF(type_name);
3139
0
        break;
3140
0
    }
3141
3142
0
    default:
3143
0
    invalid_format:
3144
0
        PyErr_Format(PyExc_SystemError, "invalid format string: %s", p);
3145
0
        return NULL;
3146
27.5M
    }
3147
3148
27.5M
    f++;
3149
27.5M
    return f;
3150
27.5M
}
3151
3152
static int
3153
unicode_from_format(_PyUnicodeWriter *writer, const char *format, va_list vargs)
3154
13.8M
{
3155
13.8M
    Py_ssize_t len = strlen(format);
3156
13.8M
    writer->min_length += len + 100;
3157
13.8M
    writer->overallocate = 1;
3158
3159
    // Copy varags to be able to pass a reference to a subfunction.
3160
13.8M
    va_list vargs2;
3161
13.8M
    va_copy(vargs2, vargs);
3162
3163
    // _PyUnicodeWriter_WriteASCIIString() below requires the format string
3164
    // to be encoded to ASCII.
3165
13.8M
    int is_ascii = (ucs1lib_find_max_char((Py_UCS1*)format, (Py_UCS1*)format + len) < 128);
3166
13.8M
    if (!is_ascii) {
3167
0
        Py_ssize_t i;
3168
0
        for (i=0; i < len && (unsigned char)format[i] <= 127; i++);
3169
0
        PyErr_Format(PyExc_ValueError,
3170
0
            "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3171
0
            "string, got a non-ASCII byte: 0x%02x",
3172
0
            (unsigned char)format[i]);
3173
0
        goto fail;
3174
0
    }
3175
3176
78.5M
    for (const char *f = format; *f; ) {
3177
64.6M
        if (*f == '%') {
3178
31.7M
            f = unicode_fromformat_arg(writer, f, &vargs2);
3179
31.7M
            if (f == NULL)
3180
0
                goto fail;
3181
31.7M
        }
3182
32.9M
        else {
3183
32.9M
            const char *p = strchr(f, '%');
3184
32.9M
            if (p != NULL) {
3185
23.3M
                len = p - f;
3186
23.3M
            }
3187
9.59M
            else {
3188
9.59M
                len = strlen(f);
3189
9.59M
                writer->overallocate = 0;
3190
9.59M
            }
3191
3192
32.9M
            if (_PyUnicodeWriter_WriteASCIIString(writer, f, len) < 0) {
3193
0
                goto fail;
3194
0
            }
3195
32.9M
            f += len;
3196
32.9M
        }
3197
64.6M
    }
3198
13.8M
    va_end(vargs2);
3199
13.8M
    return 0;
3200
3201
0
  fail:
3202
0
    va_end(vargs2);
3203
0
    return -1;
3204
13.8M
}
3205
3206
PyObject *
3207
PyUnicode_FromFormatV(const char *format, va_list vargs)
3208
13.8M
{
3209
13.8M
    _PyUnicodeWriter writer;
3210
13.8M
    _PyUnicodeWriter_Init(&writer);
3211
3212
13.8M
    if (unicode_from_format(&writer, format, vargs) < 0) {
3213
0
        _PyUnicodeWriter_Dealloc(&writer);
3214
0
        return NULL;
3215
0
    }
3216
13.8M
    return _PyUnicodeWriter_Finish(&writer);
3217
13.8M
}
3218
3219
/* Variadic front end of PyUnicode_FromFormatV().
   Return NULL with an exception set on error. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    PyObject *result = PyUnicode_FromFormatV(format, args);
    va_end(args);

    return result;
}
3230
3231
int
3232
PyUnicodeWriter_Format(PyUnicodeWriter *writer, const char *format, ...)
3233
0
{
3234
0
    va_list vargs;
3235
0
    va_start(vargs, format);
3236
0
    int res = _PyUnicodeWriter_FormatV(writer, format, vargs);
3237
0
    va_end(vargs);
3238
0
    return res;
3239
0
}
3240
3241
int
3242
_PyUnicodeWriter_FormatV(PyUnicodeWriter *writer, const char *format,
3243
                         va_list vargs)
3244
0
{
3245
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
3246
0
    Py_ssize_t old_pos = _writer->pos;
3247
3248
0
    int res = unicode_from_format(_writer, format, vargs);
3249
3250
0
    if (res < 0) {
3251
0
        _writer->pos = old_pos;
3252
0
    }
3253
0
    return res;
3254
0
}
3255
3256
static Py_ssize_t
3257
unicode_get_widechar_size(PyObject *unicode)
3258
7.10k
{
3259
7.10k
    Py_ssize_t res;
3260
3261
7.10k
    assert(unicode != NULL);
3262
7.10k
    assert(_PyUnicode_CHECK(unicode));
3263
3264
7.10k
    res = _PyUnicode_LENGTH(unicode);
3265
#if SIZEOF_WCHAR_T == 2
3266
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3267
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3268
        const Py_UCS4 *end = s + res;
3269
        for (; s < end; ++s) {
3270
            if (*s > 0xFFFF) {
3271
                ++res;
3272
            }
3273
        }
3274
    }
3275
#endif
3276
7.10k
    return res;
3277
7.10k
}
3278
3279
static void
3280
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3281
7.10k
{
3282
7.10k
    assert(unicode != NULL);
3283
7.10k
    assert(_PyUnicode_CHECK(unicode));
3284
3285
7.10k
    if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3286
0
        memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3287
0
        return;
3288
0
    }
3289
3290
7.10k
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3291
7.10k
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3292
610k
        for (; size--; ++s, ++w) {
3293
603k
            *w = *s;
3294
603k
        }
3295
7.10k
    }
3296
0
    else {
3297
0
#if SIZEOF_WCHAR_T == 4
3298
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3299
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3300
0
        for (; size--; ++s, ++w) {
3301
0
            *w = *s;
3302
0
        }
3303
#else
3304
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3305
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3306
        for (; size--; ++s, ++w) {
3307
            Py_UCS4 ch = *s;
3308
            if (ch > 0xFFFF) {
3309
                assert(ch <= MAX_UNICODE);
3310
                /* encode surrogate pair in this case */
3311
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3312
                if (!size--)
3313
                    break;
3314
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3315
            }
3316
            else {
3317
                *w = ch;
3318
            }
3319
        }
3320
#endif
3321
0
    }
3322
7.10k
}
3323
3324
#ifdef HAVE_WCHAR_H
3325
3326
/* Convert a Unicode object to a wide character string.
3327
3328
   - If w is NULL: return the number of wide characters (including the null
3329
     character) required to convert the unicode object. Ignore size argument.
3330
3331
   - Otherwise: return the number of wide characters (excluding the null
3332
     character) written into w. Write at most size wide characters (including
3333
     the null character). */
3334
Py_ssize_t
3335
PyUnicode_AsWideChar(PyObject *unicode,
3336
                     wchar_t *w,
3337
                     Py_ssize_t size)
3338
5.82k
{
3339
5.82k
    Py_ssize_t res;
3340
3341
5.82k
    if (unicode == NULL) {
3342
0
        PyErr_BadInternalCall();
3343
0
        return -1;
3344
0
    }
3345
5.82k
    if (!PyUnicode_Check(unicode)) {
3346
0
        PyErr_BadArgument();
3347
0
        return -1;
3348
0
    }
3349
3350
5.82k
    res = unicode_get_widechar_size(unicode);
3351
5.82k
    if (w == NULL) {
3352
0
        return res + 1;
3353
0
    }
3354
3355
5.82k
    if (size > res) {
3356
5.82k
        size = res + 1;
3357
5.82k
    }
3358
0
    else {
3359
0
        res = size;
3360
0
    }
3361
5.82k
    unicode_copy_as_widechar(unicode, w, size);
3362
3363
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3364
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3365
       non-Unicode locales and hence needs conversion first. */
3366
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3367
        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3368
            return -1;
3369
        }
3370
    }
3371
#endif
3372
3373
5.82k
    return res;
3374
5.82k
}
3375
3376
wchar_t*
3377
PyUnicode_AsWideCharString(PyObject *unicode,
3378
                           Py_ssize_t *size)
3379
1.27k
{
3380
1.27k
    wchar_t *buffer;
3381
1.27k
    Py_ssize_t buflen;
3382
3383
1.27k
    if (unicode == NULL) {
3384
0
        PyErr_BadInternalCall();
3385
0
        return NULL;
3386
0
    }
3387
1.27k
    if (!PyUnicode_Check(unicode)) {
3388
0
        PyErr_BadArgument();
3389
0
        return NULL;
3390
0
    }
3391
3392
1.27k
    buflen = unicode_get_widechar_size(unicode);
3393
1.27k
    buffer = (wchar_t *) PyMem_New(wchar_t, (buflen + 1));
3394
1.27k
    if (buffer == NULL) {
3395
0
        PyErr_NoMemory();
3396
0
        return NULL;
3397
0
    }
3398
1.27k
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3399
3400
#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3401
    /* Oracle Solaris uses non-Unicode internal wchar_t form for
3402
       non-Unicode locales and hence needs conversion first. */
3403
    if (_Py_LocaleUsesNonUnicodeWchar()) {
3404
        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3405
            return NULL;
3406
        }
3407
    }
3408
#endif
3409
3410
1.27k
    if (size != NULL) {
3411
826
        *size = buflen;
3412
826
    }
3413
448
    else if (wcslen(buffer) != (size_t)buflen) {
3414
0
        PyMem_Free(buffer);
3415
0
        PyErr_SetString(PyExc_ValueError,
3416
0
                        "embedded null character");
3417
0
        return NULL;
3418
0
    }
3419
1.27k
    return buffer;
3420
1.27k
}
3421
3422
#endif /* HAVE_WCHAR_H */
3423
3424
int
3425
_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3426
0
{
3427
0
    wchar_t **p = (wchar_t **)ptr;
3428
0
    if (obj == NULL) {
3429
0
        PyMem_Free(*p);
3430
0
        *p = NULL;
3431
0
        return 1;
3432
0
    }
3433
0
    if (PyUnicode_Check(obj)) {
3434
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3435
0
        if (*p == NULL) {
3436
0
            return 0;
3437
0
        }
3438
0
        return Py_CLEANUP_SUPPORTED;
3439
0
    }
3440
0
    PyErr_Format(PyExc_TypeError,
3441
0
                 "argument must be str, not %.50s",
3442
0
                 Py_TYPE(obj)->tp_name);
3443
0
    return 0;
3444
0
}
3445
3446
int
3447
_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3448
0
{
3449
0
    wchar_t **p = (wchar_t **)ptr;
3450
0
    if (obj == NULL) {
3451
0
        PyMem_Free(*p);
3452
0
        *p = NULL;
3453
0
        return 1;
3454
0
    }
3455
0
    if (obj == Py_None) {
3456
0
        *p = NULL;
3457
0
        return 1;
3458
0
    }
3459
0
    if (PyUnicode_Check(obj)) {
3460
0
        *p = PyUnicode_AsWideCharString(obj, NULL);
3461
0
        if (*p == NULL) {
3462
0
            return 0;
3463
0
        }
3464
0
        return Py_CLEANUP_SUPPORTED;
3465
0
    }
3466
0
    PyErr_Format(PyExc_TypeError,
3467
0
                 "argument must be str or None, not %.50s",
3468
0
                 Py_TYPE(obj)->tp_name);
3469
0
    return 0;
3470
0
}
3471
3472
PyObject *
3473
PyUnicode_FromOrdinal(int ordinal)
3474
228k
{
3475
228k
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3476
0
        PyErr_SetString(PyExc_ValueError,
3477
0
                        "chr() arg not in range(0x110000)");
3478
0
        return NULL;
3479
0
    }
3480
3481
228k
    return unicode_char((Py_UCS4)ordinal);
3482
228k
}
3483
3484
/* Return obj as an exact str object.

   An exact str is returned as a new reference to the same object; an
   instance of a str subtype is copied with _PyUnicode_Copy(); any other
   object raises TypeError and NULL is returned. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (PyUnicode_CheckExact(obj)) {
        return Py_NewRef(obj);
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    /* For a Unicode subtype that's not a Unicode object,
       return a true Unicode object with the same data. */
    return _PyUnicode_Copy(obj);
}
3502
3503
/* Decode a bytes object, or any object exporting a simple buffer, to str.

   obj:      bytes or bytes-like object; str is rejected with TypeError.
   encoding: codec name passed on to PyUnicode_Decode().
   errors:   error handler name passed on to PyUnicode_Decode().

   Empty input still has encoding/errors validated by
   unicode_check_encoding_errors() and then yields the empty string.
   Return a str object, or NULL with an exception set. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) != 0) {
            return PyUnicode_Decode(
                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
                    encoding, errors);
        }
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    Py_buffer view;
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        /* Release the view first: nothing below needs its contents. */
        PyBuffer_Release(&view);
        if (unicode_check_encoding_errors(encoding, errors) < 0) {
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    PyObject *decoded = PyUnicode_Decode((char*) view.buf, view.len,
                                         encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3555
3556
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_',
   while leading and trailing runs are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1).  On error the output buffer is not null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot: reserved for the terminating null character. */
    char *const out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (const char *in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               kept character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3604
3605
PyObject *
3606
PyUnicode_Decode(const char *s,
3607
                 Py_ssize_t size,
3608
                 const char *encoding,
3609
                 const char *errors)
3610
5.17M
{
3611
5.17M
    PyObject *buffer = NULL, *unicode;
3612
5.17M
    Py_buffer info;
3613
5.17M
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3614
3615
5.17M
    if (unicode_check_encoding_errors(encoding, errors) < 0) {
3616
0
        return NULL;
3617
0
    }
3618
3619
5.17M
    if (size == 0) {
3620
0
        _Py_RETURN_UNICODE_EMPTY();
3621
0
    }
3622
3623
5.17M
    if (encoding == NULL) {
3624
38.2k
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3625
38.2k
    }
3626
3627
    /* Shortcuts for common default encodings */
3628
5.13M
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3629
5.12M
        char *lower = buflower;
3630
3631
        /* Fast paths */
3632
5.12M
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3633
840k
            lower += 3;
3634
840k
            if (*lower == '_') {
3635
                /* Match "utf8" and "utf_8" */
3636
839k
                lower++;
3637
839k
            }
3638
3639
840k
            if (lower[0] == '8' && lower[1] == 0) {
3640
839k
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3641
839k
            }
3642
945
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3643
107
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3644
107
            }
3645
838
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3646
108
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3647
108
            }
3648
840k
        }
3649
4.28M
        else {
3650
4.28M
            if (strcmp(lower, "ascii") == 0
3651
3.60M
                || strcmp(lower, "us_ascii") == 0) {
3652
825k
                return PyUnicode_DecodeASCII(s, size, errors);
3653
825k
            }
3654
    #ifdef MS_WINDOWS
3655
            else if (strcmp(lower, "mbcs") == 0) {
3656
                return PyUnicode_DecodeMBCS(s, size, errors);
3657
            }
3658
    #endif
3659
3.45M
            else if (strcmp(lower, "latin1") == 0
3660
3.45M
                     || strcmp(lower, "latin_1") == 0
3661
325k
                     || strcmp(lower, "iso_8859_1") == 0
3662
3.15M
                     || strcmp(lower, "iso8859_1") == 0) {
3663
3.15M
                return PyUnicode_DecodeLatin1(s, size, errors);
3664
3.15M
            }
3665
4.28M
        }
3666
5.12M
    }
3667
3668
    /* Decode via the codec registry */
3669
310k
    buffer = NULL;
3670
310k
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3671
0
        goto onError;
3672
310k
    buffer = PyMemoryView_FromBuffer(&info);
3673
310k
    if (buffer == NULL)
3674
0
        goto onError;
3675
310k
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3676
310k
    if (unicode == NULL)
3677
129k
        goto onError;
3678
180k
    if (!PyUnicode_Check(unicode)) {
3679
0
        PyErr_Format(PyExc_TypeError,
3680
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3681
0
                     "use codecs.decode() to decode to arbitrary types",
3682
0
                     encoding,
3683
0
                     Py_TYPE(unicode)->tp_name);
3684
0
        Py_DECREF(unicode);
3685
0
        goto onError;
3686
0
    }
3687
180k
    Py_DECREF(buffer);
3688
180k
    return unicode_result(unicode);
3689
3690
129k
  onError:
3691
129k
    Py_XDECREF(buffer);
3692
129k
    return NULL;
3693
180k
}
3694
3695
/* Pass a str object through the decoder registered for `encoding` and
   return whatever object the codec produces.

   A NULL encoding selects PyUnicode_GetDefaultEncoding().  A non-str
   argument is reported with PyErr_BadArgument().
   Return the codec's result, or NULL on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const char *codec = (encoding != NULL)
                        ? encoding
                        : PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec, errors);
}
3711
3712
/* Pass a str object through the decoder registered for `encoding` and
   require the result to be a str.

   unicode:  the str object to decode; anything else is reported with
             PyErr_BadArgument().
   encoding: codec name; NULL selects PyUnicode_GetDefaultEncoding().
   errors:   error handler name handed to the codec.

   Return the decoded str (passed through unicode_result()), or NULL on
   failure.  A codec that returns a non-str object raises TypeError naming
   the type that the decoder actually returned. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL) {
        return NULL;
    }
    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result (v), not of the input:
           the input is always a str here, which would make the message
           read "returned 'str' instead of 'str'". */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3745
3746
/* Pass a str object through the encoder registered for `encoding` and
   return whatever object the codec produces.

   A NULL encoding selects PyUnicode_GetDefaultEncoding().  A non-str
   argument is reported with PyErr_BadArgument().
   Return the codec's result, or NULL on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    return PyCodec_Encode(unicode, encoding, errors);
}
3770
3771
3772
static PyObject *
3773
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3774
                      int current_locale)
3775
426
{
3776
426
    Py_ssize_t wlen;
3777
426
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3778
426
    if (wstr == NULL) {
3779
0
        return NULL;
3780
0
    }
3781
3782
426
    if ((size_t)wlen != wcslen(wstr)) {
3783
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3784
0
        PyMem_Free(wstr);
3785
0
        return NULL;
3786
0
    }
3787
3788
426
    char *str;
3789
426
    size_t error_pos;
3790
426
    const char *reason;
3791
426
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3792
426
                                 current_locale, error_handler);
3793
426
    PyMem_Free(wstr);
3794
3795
426
    if (res != 0) {
3796
0
        if (res == -2) {
3797
0
            PyObject *exc;
3798
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3799
0
                    "locale", unicode,
3800
0
                    (Py_ssize_t)error_pos,
3801
0
                    (Py_ssize_t)(error_pos+1),
3802
0
                    reason);
3803
0
            if (exc != NULL) {
3804
0
                PyCodec_StrictErrors(exc);
3805
0
                Py_DECREF(exc);
3806
0
            }
3807
0
        }
3808
0
        else if (res == -3) {
3809
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3810
0
        }
3811
0
        else {
3812
0
            PyErr_NoMemory();
3813
0
        }
3814
0
        return NULL;
3815
0
    }
3816
3817
426
    PyObject *bytes = PyBytes_FromString(str);
3818
426
    PyMem_RawFree(str);
3819
426
    return bytes;
3820
426
}
3821
3822
/* Encode a str object with unicode_encode_locale(), passing 1 as its
   current_locale flag.

   errors is translated to a handler by _Py_GetErrorHandler().
   Return a bytes object, or NULL on failure. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    return unicode_encode_locale(unicode, _Py_GetErrorHandler(errors), 1);
}
3828
3829
/* Encode a str object using the interpreter's filesystem codec state.

   The UTF-8 encoder is used when the codec state is flagged as UTF-8.
   Otherwise (unless _Py_FORCE_UTF8_FS_ENCODING is defined) a configured
   encoding name goes through PyUnicode_AsEncodedString().  With neither
   available, the error handler is taken from config->filesystem_errors
   and the locale encoder (or UTF-8 when forced) is used.
   Return the encoder's result, or NULL on failure. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    struct _Py_unicode_fs_codec *codec = &interp->unicode.fs_codec;

    if (codec->utf8) {
        return unicode_encode_utf8(unicode,
                                   codec->error_handler,
                                   codec->errors);
    }
#ifndef _Py_FORCE_UTF8_FS_ENCODING
    if (codec->encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         codec->encoding,
                                         codec->errors);
    }
#endif

    /* Before _PyUnicode_InitEncodings() is called, the Python codec
       machinery is not ready and so cannot be used:
       use wcstombs() in this case. */
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
    const wchar_t *filesystem_errors = config->filesystem_errors;
    assert(filesystem_errors != NULL);
    _Py_error_handler handler = get_error_handler_wide(filesystem_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3862
3863
/* Encode a str object to bytes.

   encoding: codec name; NULL selects the UTF-8 encoder.
   errors:   error handler name handed to the selected encoder.

   Well-known encoding names (UTF-8/16/32, ASCII, Latin-1, and "mbcs" on
   Windows) are dispatched directly to their C encoders; every other name
   goes through the codec registry.  A registry codec returning bytearray
   triggers a RuntimeWarning and its result is copied into bytes; any other
   non-bytes result raises TypeError.
   Return a bytes object, or NULL with an exception set. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Large enough for "iso_8859_1" (the longest shortcut name) plus the
       terminating null character. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (unicode_check_encoding_errors(encoding, errors) < 0) {
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized))) {
        const char *name = normalized;

        /* Fast paths */
        if (name[0] == 'u' && name[1] == 't' && name[2] == 'f') {
            name += 3;
            if (*name == '_') {
                /* Match "utf8" and "utf_8" */
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0 ||
                strcmp(name, "latin_1") == 0 ||
                strcmp(name, "iso_8859_1") == 0 ||
                strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *encoded = _PyCodec_EncodeText(unicode, encoding, errors);
    if (encoded == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(encoded)) {
        return encoded;
    }

    if (!PyByteArray_Check(encoded)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(encoded)->tp_name);
        Py_DECREF(encoded);
        return NULL;
    }

    /* If the codec returns a buffer, raise a warning and convert to bytes */
    if (PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding)) {
        Py_DECREF(encoded);
        return NULL;
    }

    PyObject *bytes = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(encoded),
                                                PyByteArray_GET_SIZE(encoded));
    Py_DECREF(encoded);
    return bytes;
}
3962
3963
/* Pass a str object through the encoder registered for `encoding` and
   require the result to be a str.

   A NULL encoding selects PyUnicode_GetDefaultEncoding().  A non-str
   argument is reported with PyErr_BadArgument(); a codec returning a
   non-str object raises TypeError naming the returned type.
   Return the codec's str result, or NULL on failure. */
PyAPI_FUNC(PyObject *)
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        encoding = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *result = PyCodec_Encode(unicode, encoding, errors);
    if (result == NULL) {
        return NULL;
    }

    if (PyUnicode_Check(result)) {
        return result;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 encoding,
                 Py_TYPE(result)->tp_name);
    Py_DECREF(result);
    return NULL;
}
3996
3997
static PyObject*
3998
unicode_decode_locale(const char *str, Py_ssize_t len,
3999
                      _Py_error_handler errors, int current_locale)
4000
16.9k
{
4001
16.9k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
4002
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4003
0
        return NULL;
4004
0
    }
4005
4006
16.9k
    wchar_t *wstr;
4007
16.9k
    size_t wlen;
4008
16.9k
    const char *reason;
4009
16.9k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
4010
16.9k
                                 current_locale, errors);
4011
16.9k
    if (res != 0) {
4012
0
        if (res == -2) {
4013
0
            PyObject *exc;
4014
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4015
0
                                        "locale", str, len,
4016
0
                                        (Py_ssize_t)wlen,
4017
0
                                        (Py_ssize_t)(wlen + 1),
4018
0
                                        reason);
4019
0
            if (exc != NULL) {
4020
0
                PyCodec_StrictErrors(exc);
4021
0
                Py_DECREF(exc);
4022
0
            }
4023
0
        }
4024
0
        else if (res == -3) {
4025
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4026
0
        }
4027
0
        else {
4028
0
            PyErr_NoMemory();
4029
0
        }
4030
0
        return NULL;
4031
0
    }
4032
4033
16.9k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4034
16.9k
    PyMem_RawFree(wstr);
4035
16.9k
    return unicode;
4036
16.9k
}
4037
4038
PyObject*
4039
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4040
                              const char *errors)
4041
0
{
4042
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4043
0
    return unicode_decode_locale(str, len, error_handler, 1);
4044
0
}
4045
4046
PyObject*
4047
PyUnicode_DecodeLocale(const char *str, const char *errors)
4048
11.8k
{
4049
11.8k
    Py_ssize_t size = (Py_ssize_t)strlen(str);
4050
11.8k
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4051
11.8k
    return unicode_decode_locale(str, size, error_handler, 1);
4052
11.8k
}
4053
4054
4055
PyObject*
4056
0
PyUnicode_DecodeFSDefault(const char *s) {
4057
0
    Py_ssize_t size = (Py_ssize_t)strlen(s);
4058
0
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
4059
0
}
4060
4061
PyObject*
4062
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4063
6.72k
{
4064
6.72k
    PyInterpreterState *interp = _PyInterpreterState_GET();
4065
6.72k
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4066
6.72k
    if (fs_codec->utf8) {
4067
1.59k
        return unicode_decode_utf8(s, size,
4068
1.59k
                                   fs_codec->error_handler,
4069
1.59k
                                   fs_codec->errors,
4070
1.59k
                                   NULL);
4071
1.59k
    }
4072
5.13k
#ifndef _Py_FORCE_UTF8_FS_ENCODING
4073
5.13k
    else if (fs_codec->encoding) {
4074
0
        return PyUnicode_Decode(s, size,
4075
0
                                fs_codec->encoding,
4076
0
                                fs_codec->errors);
4077
0
    }
4078
5.13k
#endif
4079
5.13k
    else {
4080
        /* Before _PyUnicode_InitEncodings() is called, the Python codec
4081
           machinery is not ready and so cannot be used:
4082
           use mbstowcs() in this case. */
4083
5.13k
        const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4084
5.13k
        const wchar_t *filesystem_errors = config->filesystem_errors;
4085
5.13k
        assert(filesystem_errors != NULL);
4086
5.13k
        _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4087
5.13k
        assert(errors != _Py_ERROR_UNKNOWN);
4088
#ifdef _Py_FORCE_UTF8_FS_ENCODING
4089
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
4090
#else
4091
5.13k
        return unicode_decode_locale(s, size, errors, 0);
4092
5.13k
#endif
4093
5.13k
    }
4094
6.72k
}
4095
4096
4097
int
4098
PyUnicode_FSConverter(PyObject* arg, void* addr)
4099
12.4k
{
4100
12.4k
    PyObject *path = NULL;
4101
12.4k
    PyObject *output = NULL;
4102
12.4k
    Py_ssize_t size;
4103
12.4k
    const char *data;
4104
12.4k
    if (arg == NULL) {
4105
0
        Py_DECREF(*(PyObject**)addr);
4106
0
        *(PyObject**)addr = NULL;
4107
0
        return 1;
4108
0
    }
4109
12.4k
    path = PyOS_FSPath(arg);
4110
12.4k
    if (path == NULL) {
4111
0
        return 0;
4112
0
    }
4113
12.4k
    if (PyBytes_Check(path)) {
4114
0
        output = path;
4115
0
    }
4116
12.4k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
4117
12.4k
        output = PyUnicode_EncodeFSDefault(path);
4118
12.4k
        Py_DECREF(path);
4119
12.4k
        if (!output) {
4120
0
            return 0;
4121
0
        }
4122
12.4k
        assert(PyBytes_Check(output));
4123
12.4k
    }
4124
4125
12.4k
    size = PyBytes_GET_SIZE(output);
4126
12.4k
    data = PyBytes_AS_STRING(output);
4127
12.4k
    if ((size_t)size != strlen(data)) {
4128
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
4129
0
        Py_DECREF(output);
4130
0
        return 0;
4131
0
    }
4132
12.4k
    *(PyObject**)addr = output;
4133
12.4k
    return Py_CLEANUP_SUPPORTED;
4134
12.4k
}
4135
4136
4137
int
4138
PyUnicode_FSDecoder(PyObject* arg, void* addr)
4139
21.9k
{
4140
21.9k
    if (arg == NULL) {
4141
0
        Py_DECREF(*(PyObject**)addr);
4142
0
        *(PyObject**)addr = NULL;
4143
0
        return 1;
4144
0
    }
4145
4146
21.9k
    PyObject *path = PyOS_FSPath(arg);
4147
21.9k
    if (path == NULL) {
4148
0
        return 0;
4149
0
    }
4150
4151
21.9k
    PyObject *output = NULL;
4152
21.9k
    if (PyUnicode_Check(path)) {
4153
21.9k
        output = path;
4154
21.9k
    }
4155
0
    else if (PyBytes_Check(path)) {
4156
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path),
4157
0
                                                  PyBytes_GET_SIZE(path));
4158
0
        Py_DECREF(path);
4159
0
        if (!output) {
4160
0
            return 0;
4161
0
        }
4162
0
    }
4163
0
    else {
4164
0
        PyErr_Format(PyExc_TypeError,
4165
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
4166
0
                     Py_TYPE(arg)->tp_name);
4167
0
        Py_DECREF(path);
4168
0
        return 0;
4169
0
    }
4170
4171
21.9k
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
4172
21.9k
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
4173
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4174
0
        Py_DECREF(output);
4175
0
        return 0;
4176
0
    }
4177
21.9k
    *(PyObject**)addr = output;
4178
21.9k
    return Py_CLEANUP_SUPPORTED;
4179
21.9k
}
4180
4181
4182
static int unicode_fill_utf8(PyObject *unicode);
4183
4184
4185
static int
4186
unicode_ensure_utf8(PyObject *unicode)
4187
21.3M
{
4188
21.3M
    int err = 0;
4189
21.3M
    if (PyUnicode_UTF8(unicode) == NULL) {
4190
142k
        Py_BEGIN_CRITICAL_SECTION(unicode);
4191
142k
        if (PyUnicode_UTF8(unicode) == NULL) {
4192
142k
            err = unicode_fill_utf8(unicode);
4193
142k
        }
4194
142k
        Py_END_CRITICAL_SECTION();
4195
142k
    }
4196
21.3M
    return err;
4197
21.3M
}
4198
4199
const char *
4200
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
4201
21.3M
{
4202
21.3M
    if (!PyUnicode_Check(unicode)) {
4203
0
        PyErr_BadArgument();
4204
0
        if (psize) {
4205
0
            *psize = -1;
4206
0
        }
4207
0
        return NULL;
4208
0
    }
4209
4210
21.3M
    if (unicode_ensure_utf8(unicode) == -1) {
4211
206
        if (psize) {
4212
206
            *psize = -1;
4213
206
        }
4214
206
        return NULL;
4215
206
    }
4216
4217
21.3M
    if (psize) {
4218
21.2M
        *psize = PyUnicode_UTF8_LENGTH(unicode);
4219
21.2M
    }
4220
21.3M
    return PyUnicode_UTF8(unicode);
4221
21.3M
}
4222
4223
const char *
4224
PyUnicode_AsUTF8(PyObject *unicode)
4225
70.3k
{
4226
70.3k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
4227
70.3k
}
4228
4229
const char *
4230
_PyUnicode_AsUTF8NoNUL(PyObject *unicode)
4231
1.06M
{
4232
1.06M
    Py_ssize_t size;
4233
1.06M
    const char *s = PyUnicode_AsUTF8AndSize(unicode, &size);
4234
1.06M
    if (s && strlen(s) != (size_t)size) {
4235
158
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4236
158
        return NULL;
4237
158
    }
4238
1.06M
    return s;
4239
1.06M
}
4240
4241
/*
4242
PyUnicode_GetSize() has been deprecated since Python 3.3
4243
because it returned length of Py_UNICODE.
4244
4245
But this function is part of stable abi, because it doesn't
4246
include Py_UNICODE in signature and it was not excluded from
4247
stable ABI in PEP 384.
4248
*/
4249
PyAPI_FUNC(Py_ssize_t)
4250
PyUnicode_GetSize(PyObject *unicode)
4251
0
{
4252
0
    PyErr_SetString(PyExc_RuntimeError,
4253
0
                    "PyUnicode_GetSize has been removed.");
4254
0
    return -1;
4255
0
}
4256
4257
Py_ssize_t
4258
PyUnicode_GetLength(PyObject *unicode)
4259
34.1k
{
4260
34.1k
    if (!PyUnicode_Check(unicode)) {
4261
0
        PyErr_BadArgument();
4262
0
        return -1;
4263
0
    }
4264
34.1k
    return PyUnicode_GET_LENGTH(unicode);
4265
34.1k
}
4266
4267
Py_UCS4
4268
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4269
22
{
4270
22
    const void *data;
4271
22
    int kind;
4272
4273
22
    if (!PyUnicode_Check(unicode)) {
4274
0
        PyErr_BadArgument();
4275
0
        return (Py_UCS4)-1;
4276
0
    }
4277
22
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4278
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4279
0
        return (Py_UCS4)-1;
4280
0
    }
4281
22
    data = PyUnicode_DATA(unicode);
4282
22
    kind = PyUnicode_KIND(unicode);
4283
22
    return PyUnicode_READ(kind, data, index);
4284
22
}
4285
4286
int
4287
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4288
0
{
4289
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4290
0
        PyErr_BadArgument();
4291
0
        return -1;
4292
0
    }
4293
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4294
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4295
0
        return -1;
4296
0
    }
4297
0
    if (unicode_check_modifiable(unicode))
4298
0
        return -1;
4299
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4300
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4301
0
        return -1;
4302
0
    }
4303
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4304
0
                    index, ch);
4305
0
    return 0;
4306
0
}
4307
4308
/* Return the name of the default encoding, which is always "utf-8".
   The returned pointer refers to a string literal. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *default_encoding = "utf-8";
    return default_encoding;
}
4313
4314
/* create or adjust a UnicodeDecodeError */
4315
static void
4316
make_decode_exception(PyObject **exceptionObject,
4317
                      const char *encoding,
4318
                      const char *input, Py_ssize_t length,
4319
                      Py_ssize_t startpos, Py_ssize_t endpos,
4320
                      const char *reason)
4321
274k
{
4322
274k
    if (*exceptionObject == NULL) {
4323
76.0k
        *exceptionObject = PyUnicodeDecodeError_Create(
4324
76.0k
            encoding, input, length, startpos, endpos, reason);
4325
76.0k
    }
4326
198k
    else {
4327
198k
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4328
0
            goto onError;
4329
198k
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4330
0
            goto onError;
4331
198k
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4332
0
            goto onError;
4333
198k
    }
4334
274k
    return;
4335
4336
274k
onError:
4337
0
    Py_CLEAR(*exceptionObject);
4338
0
}
4339
4340
#ifdef MS_WINDOWS
4341
static int
4342
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4343
{
4344
    if (newsize > *size) {
4345
        wchar_t *newbuf = *buf;
4346
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4347
            PyErr_NoMemory();
4348
            return -1;
4349
        }
4350
        *buf = newbuf;
4351
    }
4352
    *size = newsize;
4353
    return 0;
4354
}
4355
4356
/* error handling callback helper:
4357
   build arguments, call the callback and check the arguments,
4358
   if no exception occurred, copy the replacement to the output
4359
   and adjust various state variables.
4360
   return 0 on success, -1 on error
4361
*/
4362
4363
static int
4364
unicode_decode_call_errorhandler_wchar(
4365
    const char *errors, PyObject **errorHandler,
4366
    const char *encoding, const char *reason,
4367
    const char **input, const char **inend, Py_ssize_t *startinpos,
4368
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4369
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4370
{
4371
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4372
4373
    PyObject *restuple = NULL;
4374
    PyObject *repunicode = NULL;
4375
    Py_ssize_t outsize;
4376
    Py_ssize_t insize;
4377
    Py_ssize_t requiredsize;
4378
    Py_ssize_t newpos;
4379
    PyObject *inputobj = NULL;
4380
    Py_ssize_t repwlen;
4381
4382
    if (*errorHandler == NULL) {
4383
        *errorHandler = PyCodec_LookupError(errors);
4384
        if (*errorHandler == NULL)
4385
            goto onError;
4386
    }
4387
4388
    make_decode_exception(exceptionObject,
4389
        encoding,
4390
        *input, *inend - *input,
4391
        *startinpos, *endinpos,
4392
        reason);
4393
    if (*exceptionObject == NULL)
4394
        goto onError;
4395
4396
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4397
    if (restuple == NULL)
4398
        goto onError;
4399
    if (!PyTuple_Check(restuple)) {
4400
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4401
        goto onError;
4402
    }
4403
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4404
        goto onError;
4405
4406
    /* Copy back the bytes variables, which might have been modified by the
4407
       callback */
4408
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4409
    if (!inputobj)
4410
        goto onError;
4411
    *input = PyBytes_AS_STRING(inputobj);
4412
    insize = PyBytes_GET_SIZE(inputobj);
4413
    *inend = *input + insize;
4414
    /* we can DECREF safely, as the exception has another reference,
4415
       so the object won't go away. */
4416
    Py_DECREF(inputobj);
4417
4418
    if (newpos<0)
4419
        newpos = insize+newpos;
4420
    if (newpos<0 || newpos>insize) {
4421
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4422
        goto onError;
4423
    }
4424
4425
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4426
    if (repwlen < 0)
4427
        goto onError;
4428
    repwlen--;
4429
    /* need more space? (at least enough for what we
4430
       have+the replacement+the rest of the string (starting
4431
       at the new input position), so we won't have to check space
4432
       when there are no errors in the rest of the string) */
4433
    requiredsize = *outpos;
4434
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4435
        goto overflow;
4436
    requiredsize += repwlen;
4437
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4438
        goto overflow;
4439
    requiredsize += insize - newpos;
4440
    outsize = *bufsize;
4441
    if (requiredsize > outsize) {
4442
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4443
            requiredsize = 2*outsize;
4444
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4445
            goto onError;
4446
        }
4447
    }
4448
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
4449
    *outpos += repwlen;
4450
    *endinpos = newpos;
4451
    *inptr = *input + newpos;
4452
4453
    /* we made it! */
4454
    Py_DECREF(restuple);
4455
    return 0;
4456
4457
  overflow:
4458
    PyErr_SetString(PyExc_OverflowError,
4459
                    "decoded result is too long for a Python string");
4460
4461
  onError:
4462
    Py_XDECREF(restuple);
4463
    return -1;
4464
}
4465
#endif   /* MS_WINDOWS */
4466
4467
static int
4468
unicode_decode_call_errorhandler_writer(
4469
    const char *errors, PyObject **errorHandler,
4470
    const char *encoding, const char *reason,
4471
    const char **input, const char **inend, Py_ssize_t *startinpos,
4472
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4473
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4474
274k
{
4475
274k
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4476
4477
274k
    PyObject *restuple = NULL;
4478
274k
    PyObject *repunicode = NULL;
4479
274k
    Py_ssize_t insize;
4480
274k
    Py_ssize_t newpos;
4481
274k
    Py_ssize_t replen;
4482
274k
    Py_ssize_t remain;
4483
274k
    PyObject *inputobj = NULL;
4484
274k
    int need_to_grow = 0;
4485
274k
    const char *new_inptr;
4486
4487
274k
    if (*errorHandler == NULL) {
4488
76.0k
        *errorHandler = PyCodec_LookupError(errors);
4489
76.0k
        if (*errorHandler == NULL)
4490
0
            goto onError;
4491
76.0k
    }
4492
4493
274k
    make_decode_exception(exceptionObject,
4494
274k
        encoding,
4495
274k
        *input, *inend - *input,
4496
274k
        *startinpos, *endinpos,
4497
274k
        reason);
4498
274k
    if (*exceptionObject == NULL)
4499
0
        goto onError;
4500
4501
274k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
4502
274k
    if (restuple == NULL)
4503
48.2k
        goto onError;
4504
225k
    if (!PyTuple_Check(restuple)) {
4505
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4506
0
        goto onError;
4507
0
    }
4508
225k
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4509
0
        goto onError;
4510
4511
    /* Copy back the bytes variables, which might have been modified by the
4512
       callback */
4513
225k
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4514
225k
    if (!inputobj)
4515
0
        goto onError;
4516
225k
    remain = *inend - *input - *endinpos;
4517
225k
    *input = PyBytes_AS_STRING(inputobj);
4518
225k
    insize = PyBytes_GET_SIZE(inputobj);
4519
225k
    *inend = *input + insize;
4520
    /* we can DECREF safely, as the exception has another reference,
4521
       so the object won't go away. */
4522
225k
    Py_DECREF(inputobj);
4523
4524
225k
    if (newpos<0)
4525
0
        newpos = insize+newpos;
4526
225k
    if (newpos<0 || newpos>insize) {
4527
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4528
0
        goto onError;
4529
0
    }
4530
4531
225k
    replen = PyUnicode_GET_LENGTH(repunicode);
4532
225k
    if (replen > 1) {
4533
17.0k
        writer->min_length += replen - 1;
4534
17.0k
        need_to_grow = 1;
4535
17.0k
    }
4536
225k
    new_inptr = *input + newpos;
4537
225k
    if (*inend - new_inptr > remain) {
4538
        /* We don't know the decoding algorithm here so we make the worst
4539
           assumption that one byte decodes to one unicode character.
4540
           If unfortunately one byte could decode to more unicode characters,
4541
           the decoder may write out-of-bound then.  Is it possible for the
4542
           algorithms using this function? */
4543
6.61k
        writer->min_length += *inend - new_inptr - remain;
4544
6.61k
        need_to_grow = 1;
4545
6.61k
    }
4546
225k
    if (need_to_grow) {
4547
17.2k
        writer->overallocate = 1;
4548
17.2k
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4549
17.2k
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4550
0
            goto onError;
4551
17.2k
    }
4552
225k
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4553
0
        goto onError;
4554
4555
225k
    *endinpos = newpos;
4556
225k
    *inptr = new_inptr;
4557
4558
    /* we made it! */
4559
225k
    Py_DECREF(restuple);
4560
225k
    return 0;
4561
4562
48.2k
  onError:
4563
48.2k
    Py_XDECREF(restuple);
4564
48.2k
    return -1;
4565
225k
}
4566
4567
/* --- UTF-7 Codec -------------------------------------------------------- */
4568
4569
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4570
4571
/* Three simple macros defining base-64.
   Note: each macro may evaluate its argument more than once, so the
   argument must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC 2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 * As written, every non-NUL ASCII character outside category 3 is
 * encoded directly.  The range test comes first so the table is never
 * indexed out of bounds. */

#define ENCODE_DIRECT(c) \
    ((c) < 128 && (c) > 0 && ((utf7_category[(c)] != 3)))
4644
4645
PyObject *
4646
PyUnicode_DecodeUTF7(const char *s,
4647
                     Py_ssize_t size,
4648
                     const char *errors)
4649
0
{
4650
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4651
0
}
4652
4653
/* The decoder.  The only state we preserve is our read position,
4654
 * i.e. how many characters we have consumed.  So if we end in the
4655
 * middle of a shift sequence we have to back off the read position
4656
 * and the output to the beginning of the sequence, otherwise we lose
4657
 * all the shift state (seen bits, number of bits seen, high
4658
 * surrogate). */
4659
4660
PyObject *
4661
PyUnicode_DecodeUTF7Stateful(const char *s,
4662
                             Py_ssize_t size,
4663
                             const char *errors,
4664
                             Py_ssize_t *consumed)
4665
32.5k
{
4666
32.5k
    const char *starts = s;
4667
32.5k
    Py_ssize_t startinpos;
4668
32.5k
    Py_ssize_t endinpos;
4669
32.5k
    const char *e;
4670
32.5k
    _PyUnicodeWriter writer;
4671
32.5k
    const char *errmsg = "";
4672
32.5k
    int inShift = 0;
4673
32.5k
    Py_ssize_t shiftOutStart;
4674
32.5k
    unsigned int base64bits = 0;
4675
32.5k
    unsigned long base64buffer = 0;
4676
32.5k
    Py_UCS4 surrogate = 0;
4677
32.5k
    PyObject *errorHandler = NULL;
4678
32.5k
    PyObject *exc = NULL;
4679
4680
32.5k
    if (size == 0) {
4681
0
        if (consumed)
4682
0
            *consumed = 0;
4683
0
        _Py_RETURN_UNICODE_EMPTY();
4684
0
    }
4685
4686
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4687
32.5k
    _PyUnicodeWriter_Init(&writer);
4688
32.5k
    writer.min_length = size;
4689
4690
32.5k
    shiftOutStart = 0;
4691
32.5k
    e = s + size;
4692
4693
7.88M
    while (s < e) {
4694
7.86M
        Py_UCS4 ch;
4695
7.86M
      restart:
4696
7.86M
        ch = (unsigned char) *s;
4697
4698
7.86M
        if (inShift) { /* in a base-64 section */
4699
262k
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4700
241k
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4701
241k
                base64bits += 6;
4702
241k
                s++;
4703
241k
                if (base64bits >= 16) {
4704
                    /* we have enough bits for a UTF-16 value */
4705
82.6k
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4706
82.6k
                    base64bits -= 16;
4707
82.6k
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4708
82.6k
                    assert(outCh <= 0xffff);
4709
82.6k
                    if (surrogate) {
4710
                        /* expecting a second surrogate */
4711
8.87k
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4712
3.66k
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4713
3.66k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4714
0
                                goto onError;
4715
3.66k
                            surrogate = 0;
4716
3.66k
                            continue;
4717
3.66k
                        }
4718
5.20k
                        else {
4719
5.20k
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4720
0
                                goto onError;
4721
5.20k
                            surrogate = 0;
4722
5.20k
                        }
4723
8.87k
                    }
4724
79.0k
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4725
                        /* first surrogate */
4726
12.9k
                        surrogate = outCh;
4727
12.9k
                    }
4728
66.0k
                    else {
4729
66.0k
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4730
0
                            goto onError;
4731
66.0k
                    }
4732
79.0k
                }
4733
241k
            }
4734
20.1k
            else { /* now leaving a base-64 section */
4735
20.1k
                inShift = 0;
4736
20.1k
                if (base64bits > 0) { /* left-over bits */
4737
16.6k
                    if (base64bits >= 6) {
4738
                        /* We've seen at least one base-64 character */
4739
9.74k
                        s++;
4740
9.74k
                        errmsg = "partial character in shift sequence";
4741
9.74k
                        goto utf7Error;
4742
9.74k
                    }
4743
6.93k
                    else {
4744
                        /* Some bits remain; they should be zero */
4745
6.93k
                        if (base64buffer != 0) {
4746
1.34k
                            s++;
4747
1.34k
                            errmsg = "non-zero padding bits in shift sequence";
4748
1.34k
                            goto utf7Error;
4749
1.34k
                        }
4750
6.93k
                    }
4751
16.6k
                }
4752
9.10k
                if (surrogate && DECODE_DIRECT(ch)) {
4753
3.04k
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4754
0
                        goto onError;
4755
3.04k
                }
4756
9.10k
                surrogate = 0;
4757
9.10k
                if (ch == '-') {
4758
                    /* '-' is absorbed; other terminating
4759
                       characters are preserved */
4760
2.54k
                    s++;
4761
2.54k
                }
4762
9.10k
            }
4763
262k
        }
4764
7.60M
        else if ( ch == '+' ) {
4765
30.3k
            startinpos = s-starts;
4766
30.3k
            s++; /* consume '+' */
4767
30.3k
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4768
2.25k
                s++;
4769
2.25k
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4770
0
                    goto onError;
4771
2.25k
            }
4772
28.0k
            else if (s < e && !IS_BASE64(*s)) {
4773
4.12k
                s++;
4774
4.12k
                errmsg = "ill-formed sequence";
4775
4.12k
                goto utf7Error;
4776
4.12k
            }
4777
23.9k
            else { /* begin base64-encoded section */
4778
23.9k
                inShift = 1;
4779
23.9k
                surrogate = 0;
4780
23.9k
                shiftOutStart = writer.pos;
4781
23.9k
                base64bits = 0;
4782
23.9k
                base64buffer = 0;
4783
23.9k
            }
4784
30.3k
        }
4785
7.56M
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4786
7.47M
            s++;
4787
7.47M
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4788
0
                goto onError;
4789
7.47M
        }
4790
96.9k
        else {
4791
96.9k
            startinpos = s-starts;
4792
96.9k
            s++;
4793
96.9k
            errmsg = "unexpected special character";
4794
96.9k
            goto utf7Error;
4795
96.9k
        }
4796
7.74M
        continue;
4797
7.74M
utf7Error:
4798
112k
        endinpos = s-starts;
4799
112k
        if (unicode_decode_call_errorhandler_writer(
4800
112k
                errors, &errorHandler,
4801
112k
                "utf7", errmsg,
4802
112k
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4803
112k
                &writer))
4804
14.1k
            goto onError;
4805
112k
    }
4806
4807
    /* end of string */
4808
4809
18.4k
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4810
        /* if we're in an inconsistent state, that's an error */
4811
3.75k
        inShift = 0;
4812
3.75k
        if (surrogate ||
4813
3.17k
                (base64bits >= 6) ||
4814
2.53k
                (base64bits > 0 && base64buffer != 0)) {
4815
2.53k
            endinpos = size;
4816
2.53k
            if (unicode_decode_call_errorhandler_writer(
4817
2.53k
                    errors, &errorHandler,
4818
2.53k
                    "utf7", "unterminated shift sequence",
4819
2.53k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4820
2.53k
                    &writer))
4821
2.18k
                goto onError;
4822
347
            if (s < e)
4823
0
                goto restart;
4824
347
        }
4825
3.75k
    }
4826
4827
    /* return state */
4828
16.2k
    if (consumed) {
4829
0
        if (inShift) {
4830
0
            *consumed = startinpos;
4831
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4832
0
                PyObject *result = PyUnicode_FromKindAndData(
4833
0
                        writer.kind, writer.data, shiftOutStart);
4834
0
                Py_XDECREF(errorHandler);
4835
0
                Py_XDECREF(exc);
4836
0
                _PyUnicodeWriter_Dealloc(&writer);
4837
0
                return result;
4838
0
            }
4839
0
            writer.pos = shiftOutStart; /* back off output */
4840
0
        }
4841
0
        else {
4842
0
            *consumed = s-starts;
4843
0
        }
4844
0
    }
4845
4846
16.2k
    Py_XDECREF(errorHandler);
4847
16.2k
    Py_XDECREF(exc);
4848
16.2k
    return _PyUnicodeWriter_Finish(&writer);
4849
4850
16.2k
  onError:
4851
16.2k
    Py_XDECREF(errorHandler);
4852
16.2k
    Py_XDECREF(exc);
4853
16.2k
    _PyUnicodeWriter_Dealloc(&writer);
4854
16.2k
    return NULL;
4855
16.2k
}
4856
4857
4858
/* Encode the str object `str` as UTF-7 and return the result built by a
   PyBytesWriter, or NULL with an exception set.
   `errors` is accepted but not used by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      const char *errors)
{
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
    if (len == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);

    /* Reserve 8 output bytes per input character.
       It might be possible to tighten this worst case. */
    if (len > PY_SSIZE_T_MAX / 8) {
        return PyErr_NoMemory();
    }
    PyBytesWriter *writer = PyBytesWriter_Create(len * 8);
    if (writer == NULL) {
        return NULL;
    }

    int inShift = 0;
    unsigned int base64bits = 0;        /* bits pending in base64buffer */
    unsigned long base64buffer = 0;
    char *out = PyBytesWriter_GetData(writer);

    for (Py_ssize_t i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);

        if (!inShift) {
            /* not in a shift sequence */
            if (ch == '+') {
                /* a literal '+' is written as "+-" */
                *out++ = '+';
                *out++ = '-';
                continue;
            }
            if (ENCODE_DIRECT(ch)) {
                *out++ = (char) ch;
                continue;
            }
            /* open a shift sequence, then base64-encode ch below */
            *out++ = '+';
            inShift = 1;
        }
        else if (ENCODE_DIRECT(ch)) {
            /* shifting out */
            if (base64bits) {
                /* output remaining bits */
                *out++ = TO_BASE64(base64buffer << (6-base64bits));
                base64buffer = 0;
                base64bits = 0;
            }
            inShift = 0;
            /* Characters not in the BASE64 set implicitly unshift the sequence
               so no '-' is required, except if the character is itself a '-' */
            if (IS_BASE64(ch) || ch == '-') {
                *out++ = '-';
            }
            *out++ = (char) ch;
            continue;
        }

        /* Reaching here means ch has to be base64-encoded. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            while (base64bits >= 6) {
                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
                base64bits -= 6;
            }
            /* prepare second surrogate */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        base64bits += 16;
        base64buffer = (base64buffer << 16) | ch;
        while (base64bits >= 6) {
            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
            base64bits -= 6;
        }
    }

    /* Flush leftover bits and close a still-open shift sequence. */
    if (base64bits) {
        *out++ = TO_BASE64(base64buffer << (6-base64bits));
    }
    if (inShift) {
        *out++ = '-';
    }
    return PyBytesWriter_FinishWithPointer(writer, out);
}
4947
4948
#undef IS_BASE64
4949
#undef FROM_BASE64
4950
#undef TO_BASE64
4951
#undef DECODE_DIRECT
4952
#undef ENCODE_DIRECT
4953
4954
/* --- UTF-8 Codec -------------------------------------------------------- */
4955
4956
PyObject *
4957
PyUnicode_DecodeUTF8(const char *s,
4958
                     Py_ssize_t size,
4959
                     const char *errors)
4960
2.52M
{
4961
2.52M
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4962
2.52M
}
4963
4964
#include "stringlib/asciilib.h"
4965
#include "stringlib/codecs.h"
4966
#include "stringlib/undef.h"
4967
4968
#include "stringlib/ucs1lib.h"
4969
#include "stringlib/codecs.h"
4970
#include "stringlib/undef.h"
4971
4972
#include "stringlib/ucs2lib.h"
4973
#include "stringlib/codecs.h"
4974
#include "stringlib/undef.h"
4975
4976
#include "stringlib/ucs4lib.h"
4977
#include "stringlib/codecs.h"
4978
#include "stringlib/undef.h"
4979
4980
/* Word-sized constants, one byte pattern repeated across a C 'size_t':
     ASCII_CHAR_MASK - 0x80 in every byte; ANDing a word with it quickly
                       tells whether the word contains a non-ASCII,
                       UTF8-encoded char.
     VECTOR_0101     - 0x01 in every byte; used to count codepoints in
                       a UTF-8 string.
     VECTOR_00FF     - 0xff in every other byte; used when summing the
                       per-byte counters. */
#if (SIZEOF_SIZE_T == 4)
# define ASCII_CHAR_MASK 0x80808080U
# define VECTOR_0101     0x01010101U
# define VECTOR_00FF     0x00ff00ffU
#elif (SIZEOF_SIZE_T == 8)
# define ASCII_CHAR_MASK 0x8080808080808080ULL
# define VECTOR_0101     0x0101010101010101ULL
# define VECTOR_00FF     0x00ff00ff00ff00ffULL
#else
# error C 'size_t' size should be either 4 or 8!
#endif
4994
4995
/* ctz(v): index of the lowest set bit of `v` (count of trailing zeros).
   Callers must pass a non-zero value.  HAVE_CTZ is 1 when an
   implementation is available for the compiler in use, 0 otherwise. */
#if (defined(__GNUC__) || defined(__clang__))
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long long wide = (unsigned long long)v;
    return (unsigned int)__builtin_ctzll(wide);
}
#elif defined(_MSC_VER)
#define HAVE_CTZ 1
static inline unsigned int
ctz(size_t v)
{
    unsigned long index;
#if SIZEOF_SIZE_T == 4
    _BitScanForward(&index, v);
#else
    _BitScanForward64(&index, v);
#endif /* SIZEOF_SIZE_T */
    return index;
}
#else
#define HAVE_CTZ 0
#endif
5018
5019
#if HAVE_CTZ && PY_LITTLE_ENDIAN
// Load p[0]..p[size-1] into a zero-initialised size_t, one byte at a time,
// so there is no unaligned word access and nothing past p[size-1] is read.
// Bytes at positions >= size stay zero.  A `size` larger than
// SIZEOF_SIZE_T is handled like SIZEOF_SIZE_T.
static size_t
load_unaligned(const unsigned char *p, size_t size)
{
    union {
        size_t word;
        unsigned char bytes[SIZEOF_SIZE_T];
    } view;
    view.word = 0;
    // This switch statement assumes little endian because:
    // * union is faster than bitwise or and shift.
    // * big endian machine is rare and hard to maintain.
    // Each case copies one byte and falls through to the smaller sizes.
    switch (size) {
    default:
#if SIZEOF_SIZE_T == 8
    case 8:
        view.bytes[7] = p[7];
        _Py_FALLTHROUGH;
    case 7:
        view.bytes[6] = p[6];
        _Py_FALLTHROUGH;
    case 6:
        view.bytes[5] = p[5];
        _Py_FALLTHROUGH;
    case 5:
        view.bytes[4] = p[4];
        _Py_FALLTHROUGH;
#endif
    case 4:
        view.bytes[3] = p[3];
        _Py_FALLTHROUGH;
    case 3:
        view.bytes[2] = p[2];
        _Py_FALLTHROUGH;
    case 2:
        view.bytes[1] = p[1];
        _Py_FALLTHROUGH;
    case 1:
        view.bytes[0] = p[0];
        break;
    case 0:
        break;
    }
    return view.word;
}
#endif
5066
5067
/*
5068
 * Find the first non-ASCII character in a byte sequence.
5069
 *
5070
 * This function scans a range of bytes from `start` to `end` and returns the
5071
 * index of the first byte that is not an ASCII character (i.e., has the most
5072
 * significant bit set). If all characters in the range are ASCII, it returns
5073
 * `end - start`.
5074
 */
5075
static Py_ssize_t
5076
find_first_nonascii(const unsigned char *start, const unsigned char *end)
5077
13.8M
{
5078
    // The search is done in `size_t` chunks.
5079
    // The start and end might not be aligned at `size_t` boundaries,
5080
    // so they're handled specially.
5081
5082
13.8M
    const unsigned char *p = start;
5083
5084
13.8M
    if (end - start >= SIZEOF_SIZE_T) {
5085
        // Avoid unaligned read.
5086
3.80M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5087
3.80M
        size_t u;
5088
3.80M
        memcpy(&u, p, sizeof(size_t));
5089
3.80M
        u &= ASCII_CHAR_MASK;
5090
3.80M
        if (u) {
5091
204k
            return (ctz(u) - 7) / 8;
5092
204k
        }
5093
3.60M
        p = _Py_ALIGN_DOWN(p + SIZEOF_SIZE_T, SIZEOF_SIZE_T);
5094
#else /* PY_LITTLE_ENDIAN && HAVE_CTZ */
5095
        const unsigned char *p2 = _Py_ALIGN_UP(p, SIZEOF_SIZE_T);
5096
        while (p < p2) {
5097
            if (*p & 0x80) {
5098
                return p - start;
5099
            }
5100
            p++;
5101
        }
5102
#endif
5103
5104
3.60M
        const unsigned char *e = end - SIZEOF_SIZE_T;
5105
105M
        while (p <= e) {
5106
102M
            size_t u = (*(const size_t *)p) & ASCII_CHAR_MASK;
5107
102M
            if (u) {
5108
157k
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5109
157k
                return p - start + (ctz(u) - 7) / 8;
5110
#else
5111
                // big endian and minor compilers are difficult to test.
5112
                // fallback to per byte check.
5113
                break;
5114
#endif
5115
157k
            }
5116
102M
            p += SIZEOF_SIZE_T;
5117
102M
        }
5118
3.60M
    }
5119
13.4M
#if PY_LITTLE_ENDIAN && HAVE_CTZ
5120
13.8M
    assert((end - p) < SIZEOF_SIZE_T);
5121
    // we can not use *(const size_t*)p to avoid buffer overrun.
5122
13.4M
    size_t u = load_unaligned(p, end - p) & ASCII_CHAR_MASK;
5123
13.4M
    if (u) {
5124
169k
        return p - start + (ctz(u) - 7) / 8;
5125
169k
    }
5126
13.2M
    return end - start;
5127
#else
5128
    while (p < end) {
5129
        if (*p & 0x80) {
5130
            break;
5131
        }
5132
        p++;
5133
    }
5134
    return p - start;
5135
#endif
5136
13.4M
}
5137
5138
// Return 1 when `ch` may begin a UTF-8 sequence (0xxxxxxx or 11xxxxxx) and
// 0 when it is a continuation byte (10xxxxxx).  Only bits 6 and 7 of `ch`
// influence the result.
static inline int
scalar_utf8_start_char(unsigned int ch)
{
    // A continuation byte is exactly one whose two top bits are "10".
    return (ch & 0xC0) != 0x80;
}
5144
5145
static inline size_t
5146
vector_utf8_start_chars(size_t v)
5147
274M
{
5148
274M
    return ((~v >> 7) | (v >> 6)) & VECTOR_0101;
5149
274M
}
5150
5151
5152
// Count the number of UTF-8 code points in a given byte sequence.
5153
static Py_ssize_t
5154
utf8_count_codepoints(const unsigned char *s, const unsigned char *end)
5155
246k
{
5156
246k
    Py_ssize_t len = 0;
5157
5158
246k
    if (end - s >= SIZEOF_SIZE_T) {
5159
179k
        while (!_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
5160
18.7k
            len += scalar_utf8_start_char(*s++);
5161
18.7k
        }
5162
5163
1.39M
        while (s + SIZEOF_SIZE_T <= end) {
5164
1.23M
            const unsigned char *e = end;
5165
1.23M
            if (e - s > SIZEOF_SIZE_T * 255) {
5166
1.07M
                e = s + SIZEOF_SIZE_T * 255;
5167
1.07M
            }
5168
1.23M
            Py_ssize_t vstart = 0;
5169
276M
            while (s + SIZEOF_SIZE_T <= e) {
5170
274M
                size_t v = *(size_t*)s;
5171
274M
                size_t vs = vector_utf8_start_chars(v);
5172
274M
                vstart += vs;
5173
274M
                s += SIZEOF_SIZE_T;
5174
274M
            }
5175
1.23M
            vstart = (vstart & VECTOR_00FF) + ((vstart >> 8) & VECTOR_00FF);
5176
1.23M
            vstart += vstart >> 16;
5177
1.23M
#if SIZEOF_SIZE_T == 8
5178
1.23M
            vstart += vstart >> 32;
5179
1.23M
#endif
5180
1.23M
            len += vstart & 0x7ff;
5181
1.23M
        }
5182
161k
    }
5183
926k
    while (s < end) {
5184
680k
        len += scalar_utf8_start_char(*s++);
5185
680k
    }
5186
246k
    return len;
5187
246k
}
5188
5189
static Py_ssize_t
5190
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
5191
5.19M
{
5192
5.19M
#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5193
5.19M
    if (_Py_IS_ALIGNED(start, ALIGNOF_SIZE_T)
5194
5.12M
        && _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
5195
847k
    {
5196
        /* Fast path, see in STRINGLIB(utf8_decode) for
5197
           an explanation. */
5198
847k
        const char *p = start;
5199
847k
        Py_UCS1 *q = dest;
5200
1.72M
        while (p + SIZEOF_SIZE_T <= end) {
5201
1.00M
            size_t value = *(const size_t *) p;
5202
1.00M
            if (value & ASCII_CHAR_MASK)
5203
128k
                break;
5204
875k
            *((size_t *)q) = value;
5205
875k
            p += SIZEOF_SIZE_T;
5206
875k
            q += SIZEOF_SIZE_T;
5207
875k
        }
5208
3.89M
        while (p < end) {
5209
3.19M
            if ((unsigned char)*p & 0x80)
5210
146k
                break;
5211
3.04M
            *q++ = *p++;
5212
3.04M
        }
5213
847k
        return p - start;
5214
847k
    }
5215
4.35M
#endif
5216
4.35M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)start,
5217
4.35M
                                         (const unsigned char*)end);
5218
4.35M
    memcpy(dest, start, pos);
5219
4.35M
    return pos;
5220
5.19M
}
5221
5222
static int
5223
unicode_decode_utf8_impl(_PyUnicodeWriter *writer,
5224
                         const char *starts, const char *s, const char *end,
5225
                         _Py_error_handler error_handler,
5226
                         const char *errors,
5227
                         Py_ssize_t *consumed)
5228
533k
{
5229
533k
    Py_ssize_t startinpos, endinpos;
5230
533k
    const char *errmsg = "";
5231
533k
    PyObject *error_handler_obj = NULL;
5232
533k
    PyObject *exc = NULL;
5233
5234
163M
    while (s < end) {
5235
163M
        Py_UCS4 ch;
5236
163M
        int kind = writer->kind;
5237
5238
163M
        if (kind == PyUnicode_1BYTE_KIND) {
5239
486k
            if (PyUnicode_IS_ASCII(writer->buffer))
5240
285k
                ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
5241
200k
            else
5242
200k
                ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
5243
162M
        } else if (kind == PyUnicode_2BYTE_KIND) {
5244
86.1M
            ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
5245
86.1M
        } else {
5246
76.7M
            assert(kind == PyUnicode_4BYTE_KIND);
5247
76.7M
            ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
5248
76.7M
        }
5249
5250
163M
        switch (ch) {
5251
468k
        case 0:
5252
468k
            if (s == end || consumed)
5253
446k
                goto End;
5254
22.3k
            errmsg = "unexpected end of data";
5255
22.3k
            startinpos = s - starts;
5256
22.3k
            endinpos = end - starts;
5257
22.3k
            break;
5258
125M
        case 1:
5259
125M
            errmsg = "invalid start byte";
5260
125M
            startinpos = s - starts;
5261
125M
            endinpos = startinpos + 1;
5262
125M
            break;
5263
35.5M
        case 2:
5264
35.5M
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5265
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5266
0
            {
5267
                /* Truncated surrogate code in range D800-DFFF */
5268
0
                goto End;
5269
0
            }
5270
35.5M
            _Py_FALLTHROUGH;
5271
36.6M
        case 3:
5272
36.7M
        case 4:
5273
36.7M
            errmsg = "invalid continuation byte";
5274
36.7M
            startinpos = s - starts;
5275
36.7M
            endinpos = startinpos + ch - 1;
5276
36.7M
            break;
5277
283k
        default:
5278
            // ch doesn't fit into kind, so change the buffer kind to write
5279
            // the character
5280
283k
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
5281
0
                goto onError;
5282
283k
            continue;
5283
163M
        }
5284
5285
162M
        if (error_handler == _Py_ERROR_UNKNOWN)
5286
107k
            error_handler = _Py_GetErrorHandler(errors);
5287
5288
162M
        switch (error_handler) {
5289
0
        case _Py_ERROR_IGNORE:
5290
0
            s += (endinpos - startinpos);
5291
0
            break;
5292
5293
162M
        case _Py_ERROR_REPLACE:
5294
162M
            if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
5295
0
                goto onError;
5296
162M
            s += (endinpos - startinpos);
5297
162M
            break;
5298
5299
3.29k
        case _Py_ERROR_SURROGATEESCAPE:
5300
3.29k
        {
5301
3.29k
            Py_ssize_t i;
5302
5303
3.29k
            if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
5304
0
                goto onError;
5305
7.00k
            for (i=startinpos; i<endinpos; i++) {
5306
3.71k
                ch = (Py_UCS4)(unsigned char)(starts[i]);
5307
3.71k
                PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
5308
3.71k
                                ch + 0xdc00);
5309
3.71k
                writer->pos++;
5310
3.71k
            }
5311
3.29k
            s += (endinpos - startinpos);
5312
3.29k
            break;
5313
3.29k
        }
5314
5315
3.54k
        default:
5316
3.54k
            if (unicode_decode_call_errorhandler_writer(
5317
3.54k
                    errors, &error_handler_obj,
5318
3.54k
                    "utf-8", errmsg,
5319
3.54k
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5320
3.54k
                    writer)) {
5321
3.54k
                goto onError;
5322
3.54k
            }
5323
5324
0
            if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
5325
0
                return -1;
5326
0
            }
5327
162M
        }
5328
162M
    }
5329
5330
529k
End:
5331
529k
    if (consumed)
5332
902
        *consumed = s - starts;
5333
5334
529k
    Py_XDECREF(error_handler_obj);
5335
529k
    Py_XDECREF(exc);
5336
529k
    return 0;
5337
5338
3.54k
onError:
5339
3.54k
    Py_XDECREF(error_handler_obj);
5340
3.54k
    Py_XDECREF(exc);
5341
3.54k
    return -1;
5342
533k
}
5343
5344
5345
static PyObject *
5346
unicode_decode_utf8(const char *s, Py_ssize_t size,
5347
                    _Py_error_handler error_handler, const char *errors,
5348
                    Py_ssize_t *consumed)
5349
11.4M
{
5350
11.4M
    if (size == 0) {
5351
69.7k
        if (consumed) {
5352
0
            *consumed = 0;
5353
0
        }
5354
69.7k
        _Py_RETURN_UNICODE_EMPTY();
5355
69.7k
    }
5356
5357
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5358
11.3M
    if (size == 1 && (unsigned char)s[0] < 128) {
5359
1.90M
        if (consumed) {
5360
0
            *consumed = 1;
5361
0
        }
5362
1.90M
        return get_latin1_char((unsigned char)s[0]);
5363
1.90M
    }
5364
5365
    // I don't know this check is necessary or not. But there is a test
5366
    // case that requires size=PY_SSIZE_T_MAX cause MemoryError.
5367
9.47M
    if (PY_SSIZE_T_MAX - sizeof(PyCompactUnicodeObject) < (size_t)size) {
5368
0
        PyErr_NoMemory();
5369
0
        return NULL;
5370
0
    }
5371
5372
9.47M
    const char *starts = s;
5373
9.47M
    const char *end = s + size;
5374
5375
9.47M
    Py_ssize_t pos = find_first_nonascii((const unsigned char*)starts, (const unsigned char*)end);
5376
9.47M
    if (pos == size) {  // fast path: ASCII string.
5377
8.98M
        PyObject *u = PyUnicode_New(size, 127);
5378
8.98M
        if (u == NULL) {
5379
0
            return NULL;
5380
0
        }
5381
8.98M
        memcpy(PyUnicode_1BYTE_DATA(u), s, size);
5382
8.98M
        if (consumed) {
5383
0
            *consumed = size;
5384
0
        }
5385
8.98M
        return u;
5386
8.98M
    }
5387
5388
488k
    int maxchr = 127;
5389
488k
    Py_ssize_t maxsize = size;
5390
5391
488k
    unsigned char ch = (unsigned char)(s[pos]);
5392
    // error handler other than strict may remove/replace the invalid byte.
5393
    // consumed != NULL allows 1~3 bytes remainings.
5394
    // 0x80 <= ch < 0xc2 is invalid start byte that cause UnicodeDecodeError.
5395
    // otherwise: check the input and decide the maxchr and maxsize to reduce
5396
    // reallocation and copy.
5397
488k
    if (error_handler == _Py_ERROR_STRICT && !consumed && ch >= 0xc2) {
5398
        // we only calculate the number of codepoints and don't determine the exact maxchr.
5399
        // This is because writing fast and portable SIMD code to find maxchr is difficult.
5400
        // If reallocation occurs for a larger maxchar, knowing the exact number of codepoints
5401
        // means that it is no longer necessary to allocate several times the required amount
5402
        // of memory.
5403
246k
        maxsize = utf8_count_codepoints((const unsigned char *)s, (const unsigned char *)end);
5404
246k
        if (ch < 0xc4) { // latin1
5405
127k
            maxchr = 0xff;
5406
127k
        }
5407
118k
        else if (ch < 0xf0) { // ucs2
5408
107k
            maxchr = 0xffff;
5409
107k
        }
5410
11.4k
        else { // ucs4
5411
11.4k
            maxchr = 0x10ffff;
5412
11.4k
        }
5413
246k
    }
5414
488k
    PyObject *u = PyUnicode_New(maxsize, maxchr);
5415
488k
    if (!u) {
5416
0
        return NULL;
5417
0
    }
5418
5419
    // Use _PyUnicodeWriter after fast path is failed.
5420
488k
    _PyUnicodeWriter writer;
5421
488k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
5422
488k
    if (maxchr <= 255) {
5423
369k
        memcpy(PyUnicode_1BYTE_DATA(u), s, pos);
5424
369k
        s += pos;
5425
369k
        size -= pos;
5426
369k
        writer.pos = pos;
5427
369k
    }
5428
5429
488k
    if (unicode_decode_utf8_impl(&writer, starts, s, end,
5430
488k
                                 error_handler, errors,
5431
488k
                                 consumed) < 0) {
5432
3.54k
        _PyUnicodeWriter_Dealloc(&writer);
5433
3.54k
        return NULL;
5434
3.54k
    }
5435
484k
    return _PyUnicodeWriter_Finish(&writer);
5436
488k
}
5437
5438
5439
// Used by PyUnicodeWriter_WriteUTF8() implementation
5440
static int
5441
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
5442
                           const char *s, Py_ssize_t size,
5443
                           _Py_error_handler error_handler, const char *errors,
5444
                           Py_ssize_t *consumed)
5445
4.36M
{
5446
4.36M
    if (size == 0) {
5447
7.44k
        if (consumed) {
5448
0
            *consumed = 0;
5449
0
        }
5450
7.44k
        return 0;
5451
7.44k
    }
5452
5453
    // fast path: try ASCII string.
5454
4.36M
    if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
5455
0
        return -1;
5456
0
    }
5457
5458
4.36M
    const char *starts = s;
5459
4.36M
    const char *end = s + size;
5460
4.36M
    Py_ssize_t decoded = 0;
5461
4.36M
    Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
5462
4.36M
    if (writer->kind == PyUnicode_1BYTE_KIND) {
5463
4.35M
        decoded = ascii_decode(s, end, dest);
5464
4.35M
        writer->pos += decoded;
5465
5466
4.35M
        if (decoded == size) {
5467
4.31M
            if (consumed) {
5468
903
                *consumed = size;
5469
903
            }
5470
4.31M
            return 0;
5471
4.31M
        }
5472
42.8k
        s += decoded;
5473
42.8k
        size -= decoded;
5474
42.8k
    }
5475
5476
45.1k
    return unicode_decode_utf8_impl(writer, starts, s, end,
5477
45.1k
                                    error_handler, errors, consumed);
5478
4.36M
}
5479
5480
5481
PyObject *
5482
PyUnicode_DecodeUTF8Stateful(const char *s,
5483
                             Py_ssize_t size,
5484
                             const char *errors,
5485
                             Py_ssize_t *consumed)
5486
11.4M
{
5487
11.4M
    return unicode_decode_utf8(s, size,
5488
11.4M
                               errors ? _Py_ERROR_UNKNOWN : _Py_ERROR_STRICT,
5489
11.4M
                               errors, consumed);
5490
11.4M
}
5491
5492
5493
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5494
   non-zero, use strict error handler otherwise.
5495
5496
   On success, write a pointer to a newly allocated wide character string into
5497
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5498
   (in number of wchar_t units) into *wlen (if wlen is set).
5499
5500
   On memory allocation failure, return -1.
5501
5502
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5503
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5504
   is not NULL, write the decoding error message into *reason. */
5505
int
5506
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5507
                 const char **reason, _Py_error_handler errors)
5508
5.24k
{
5509
5.24k
    const char *orig_s = s;
5510
5.24k
    const char *e;
5511
5.24k
    wchar_t *unicode;
5512
5.24k
    Py_ssize_t outpos;
5513
5514
5.24k
    int surrogateescape = 0;
5515
5.24k
    int surrogatepass = 0;
5516
5.24k
    switch (errors)
5517
5.24k
    {
5518
0
    case _Py_ERROR_STRICT:
5519
0
        break;
5520
5.24k
    case _Py_ERROR_SURROGATEESCAPE:
5521
5.24k
        surrogateescape = 1;
5522
5.24k
        break;
5523
0
    case _Py_ERROR_SURROGATEPASS:
5524
0
        surrogatepass = 1;
5525
0
        break;
5526
0
    default:
5527
0
        return -3;
5528
5.24k
    }
5529
5530
    /* Note: size will always be longer than the resulting Unicode
5531
       character count */
5532
5.24k
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1 < size) {
5533
0
        return -1;
5534
0
    }
5535
5536
5.24k
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5537
5.24k
    if (!unicode) {
5538
0
        return -1;
5539
0
    }
5540
5541
    /* Unpack UTF-8 encoded data */
5542
5.24k
    e = s + size;
5543
5.24k
    outpos = 0;
5544
5.24k
    while (s < e) {
5545
5.24k
        Py_UCS4 ch;
5546
5.24k
#if SIZEOF_WCHAR_T == 4
5547
5.24k
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5548
#else
5549
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5550
#endif
5551
5.24k
        if (ch > 0xFF) {
5552
0
#if SIZEOF_WCHAR_T == 4
5553
0
            Py_UNREACHABLE();
5554
#else
5555
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5556
            /* write a surrogate pair */
5557
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5558
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5559
#endif
5560
0
        }
5561
5.24k
        else {
5562
5.24k
            if (!ch && s == e) {
5563
5.24k
                break;
5564
5.24k
            }
5565
5566
0
            if (surrogateescape) {
5567
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5568
0
            }
5569
0
            else {
5570
                /* Is it a valid three-byte code? */
5571
0
                if (surrogatepass
5572
0
                    && (e - s) >= 3
5573
0
                    && (s[0] & 0xf0) == 0xe0
5574
0
                    && (s[1] & 0xc0) == 0x80
5575
0
                    && (s[2] & 0xc0) == 0x80)
5576
0
                {
5577
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5578
0
                    s += 3;
5579
0
                    unicode[outpos++] = ch;
5580
0
                }
5581
0
                else {
5582
0
                    PyMem_RawFree(unicode );
5583
0
                    if (reason != NULL) {
5584
0
                        switch (ch) {
5585
0
                        case 0:
5586
0
                            *reason = "unexpected end of data";
5587
0
                            break;
5588
0
                        case 1:
5589
0
                            *reason = "invalid start byte";
5590
0
                            break;
5591
                        /* 2, 3, 4 */
5592
0
                        default:
5593
0
                            *reason = "invalid continuation byte";
5594
0
                            break;
5595
0
                        }
5596
0
                    }
5597
0
                    if (wlen != NULL) {
5598
0
                        *wlen = s - orig_s;
5599
0
                    }
5600
0
                    return -2;
5601
0
                }
5602
0
            }
5603
0
        }
5604
5.24k
    }
5605
5.24k
    unicode[outpos] = L'\0';
5606
5.24k
    if (wlen) {
5607
5.24k
        *wlen = outpos;
5608
5.24k
    }
5609
5.24k
    *wstr = unicode;
5610
5.24k
    return 0;
5611
5.24k
}
5612
5613
5614
wchar_t*
5615
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5616
                               size_t *wlen)
5617
0
{
5618
0
    wchar_t *wstr;
5619
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5620
0
                               &wstr, wlen,
5621
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5622
0
    if (res != 0) {
5623
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5624
0
        assert(res != -3);
5625
0
        if (wlen) {
5626
0
            *wlen = (size_t)res;
5627
0
        }
5628
0
        return NULL;
5629
0
    }
5630
0
    return wstr;
5631
0
}
5632
5633
5634
/* UTF-8 encoder.
5635
5636
   On success, return 0 and write the newly allocated character string (use
5637
   PyMem_Free() to free the memory) into *str.
5638
5639
   On encoding failure, return -2 and write the position of the invalid
5640
   surrogate character into *error_pos (if error_pos is set) and the decoding
5641
   error message into *reason (if reason is set).
5642
5643
   On memory allocation failure, return -1. */
5644
int
5645
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5646
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5647
650
{
5648
650
    const Py_ssize_t max_char_size = 4;
5649
650
    Py_ssize_t len = wcslen(text);
5650
5651
650
    assert(len >= 0);
5652
5653
650
    int surrogateescape = 0;
5654
650
    int surrogatepass = 0;
5655
650
    switch (errors)
5656
650
    {
5657
64
    case _Py_ERROR_STRICT:
5658
64
        break;
5659
586
    case _Py_ERROR_SURROGATEESCAPE:
5660
586
        surrogateescape = 1;
5661
586
        break;
5662
0
    case _Py_ERROR_SURROGATEPASS:
5663
0
        surrogatepass = 1;
5664
0
        break;
5665
0
    default:
5666
0
        return -3;
5667
650
    }
5668
5669
650
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5670
0
        return -1;
5671
0
    }
5672
650
    char *bytes;
5673
650
    if (raw_malloc) {
5674
650
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5675
650
    }
5676
0
    else {
5677
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5678
0
    }
5679
650
    if (bytes == NULL) {
5680
0
        return -1;
5681
0
    }
5682
5683
650
    char *p = bytes;
5684
650
    Py_ssize_t i;
5685
43.6k
    for (i = 0; i < len; ) {
5686
42.9k
        Py_ssize_t ch_pos = i;
5687
42.9k
        Py_UCS4 ch = text[i];
5688
42.9k
        i++;
5689
#if Py_UNICODE_SIZE == 2
5690
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5691
            && i < len
5692
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5693
        {
5694
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5695
            i++;
5696
        }
5697
#endif
5698
5699
42.9k
        if (ch < 0x80) {
5700
            /* Encode ASCII */
5701
42.9k
            *p++ = (char) ch;
5702
5703
42.9k
        }
5704
0
        else if (ch < 0x0800) {
5705
            /* Encode Latin-1 */
5706
0
            *p++ = (char)(0xc0 | (ch >> 6));
5707
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5708
0
        }
5709
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5710
            /* surrogateescape error handler */
5711
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5712
0
                if (error_pos != NULL) {
5713
0
                    *error_pos = (size_t)ch_pos;
5714
0
                }
5715
0
                if (reason != NULL) {
5716
0
                    *reason = "encoding error";
5717
0
                }
5718
0
                if (raw_malloc) {
5719
0
                    PyMem_RawFree(bytes);
5720
0
                }
5721
0
                else {
5722
0
                    PyMem_Free(bytes);
5723
0
                }
5724
0
                return -2;
5725
0
            }
5726
0
            *p++ = (char)(ch & 0xff);
5727
0
        }
5728
0
        else if (ch < 0x10000) {
5729
0
            *p++ = (char)(0xe0 | (ch >> 12));
5730
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5731
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5732
0
        }
5733
0
        else {  /* ch >= 0x10000 */
5734
0
            assert(ch <= MAX_UNICODE);
5735
            /* Encode UCS4 Unicode ordinals */
5736
0
            *p++ = (char)(0xf0 | (ch >> 18));
5737
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5738
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5739
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5740
0
        }
5741
42.9k
    }
5742
650
    *p++ = '\0';
5743
5744
650
    size_t final_size = (p - bytes);
5745
650
    char *bytes2;
5746
650
    if (raw_malloc) {
5747
650
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5748
650
    }
5749
0
    else {
5750
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5751
0
    }
5752
650
    if (bytes2 == NULL) {
5753
0
        if (error_pos != NULL) {
5754
0
            *error_pos = (size_t)-1;
5755
0
        }
5756
0
        if (raw_malloc) {
5757
0
            PyMem_RawFree(bytes);
5758
0
        }
5759
0
        else {
5760
0
            PyMem_Free(bytes);
5761
0
        }
5762
0
        return -1;
5763
0
    }
5764
650
    *str = bytes2;
5765
650
    return 0;
5766
650
}
5767
5768
5769
/* Primary internal function which creates utf8 encoded bytes objects.
5770
5771
   Allocation strategy:  if the string is short, convert into a stack buffer
5772
   and allocate exactly as much space needed at the end.  Else allocate the
5773
   maximum possible needed (4 result bytes per Unicode character), and return
5774
   the excess memory at the end.
5775
*/
5776
static PyObject *
5777
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5778
                    const char *errors)
5779
15.3M
{
5780
15.3M
    if (!PyUnicode_Check(unicode)) {
5781
0
        PyErr_BadArgument();
5782
0
        return NULL;
5783
0
    }
5784
5785
15.3M
    if (PyUnicode_UTF8(unicode))
5786
9.44M
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5787
9.44M
                                         PyUnicode_UTF8_LENGTH(unicode));
5788
5789
5.89M
    int kind = PyUnicode_KIND(unicode);
5790
5.89M
    const void *data = PyUnicode_DATA(unicode);
5791
5.89M
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5792
5793
5.89M
    PyBytesWriter *writer;
5794
5.89M
    char *end;
5795
5796
5.89M
    switch (kind) {
5797
0
    default:
5798
0
        Py_UNREACHABLE();
5799
4.40M
    case PyUnicode_1BYTE_KIND:
5800
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5801
4.40M
        assert(!PyUnicode_IS_ASCII(unicode));
5802
4.40M
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5803
4.40M
                                      error_handler, errors, &end);
5804
4.40M
        break;
5805
1.42M
    case PyUnicode_2BYTE_KIND:
5806
1.42M
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5807
1.42M
                                      error_handler, errors, &end);
5808
1.42M
        break;
5809
63.0k
    case PyUnicode_4BYTE_KIND:
5810
63.0k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5811
63.0k
                                      error_handler, errors, &end);
5812
63.0k
        break;
5813
5.89M
    }
5814
5815
5.89M
    if (writer == NULL) {
5816
153k
        PyBytesWriter_Discard(writer);
5817
153k
        return NULL;
5818
153k
    }
5819
5.73M
    return PyBytesWriter_FinishWithPointer(writer, end);
5820
5.89M
}
5821
5822
static int
5823
unicode_fill_utf8(PyObject *unicode)
5824
142k
{
5825
142k
    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
5826
    /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5827
142k
    assert(!PyUnicode_IS_ASCII(unicode));
5828
5829
142k
    int kind = PyUnicode_KIND(unicode);
5830
142k
    const void *data = PyUnicode_DATA(unicode);
5831
142k
    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5832
5833
142k
    PyBytesWriter *writer;
5834
142k
    char *end;
5835
5836
142k
    switch (kind) {
5837
0
    default:
5838
0
        Py_UNREACHABLE();
5839
111k
    case PyUnicode_1BYTE_KIND:
5840
111k
        writer = ucs1lib_utf8_encoder(unicode, data, size,
5841
111k
                                      _Py_ERROR_STRICT, NULL, &end);
5842
111k
        break;
5843
26.3k
    case PyUnicode_2BYTE_KIND:
5844
26.3k
        writer = ucs2lib_utf8_encoder(unicode, data, size,
5845
26.3k
                                      _Py_ERROR_STRICT, NULL, &end);
5846
26.3k
        break;
5847
5.09k
    case PyUnicode_4BYTE_KIND:
5848
5.09k
        writer = ucs4lib_utf8_encoder(unicode, data, size,
5849
5.09k
                                      _Py_ERROR_STRICT, NULL, &end);
5850
5.09k
        break;
5851
142k
    }
5852
142k
    if (writer == NULL) {
5853
206
        return -1;
5854
206
    }
5855
5856
142k
    const char *start = PyBytesWriter_GetData(writer);
5857
142k
    Py_ssize_t len = end - start;
5858
5859
142k
    char *cache = PyMem_Malloc(len + 1);
5860
142k
    if (cache == NULL) {
5861
0
        PyBytesWriter_Discard(writer);
5862
0
        PyErr_NoMemory();
5863
0
        return -1;
5864
0
    }
5865
142k
    memcpy(cache, start, len);
5866
142k
    cache[len] = '\0';
5867
142k
    PyUnicode_SET_UTF8_LENGTH(unicode, len);
5868
142k
    PyUnicode_SET_UTF8(unicode, cache);
5869
142k
    PyBytesWriter_Discard(writer);
5870
142k
    return 0;
5871
142k
}
5872
5873
/* Encode `unicode` to a UTF-8 bytes object using the error handler named by
   `errors`; the handler is passed as _Py_ERROR_UNKNOWN so that it is
   resolved from `errors` by the encoder. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN,
                                            errors);
    return encoded;
}
5878
5879
5880
/* Encode `unicode` to a UTF-8 bytes object, passing NULL as the `errors`
   argument of _PyUnicode_AsUTF8String(). */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5885
5886
/* --- UTF-32 Codec ------------------------------------------------------- */
5887
5888
PyObject *
5889
PyUnicode_DecodeUTF32(const char *s,
5890
                      Py_ssize_t size,
5891
                      const char *errors,
5892
                      int *byteorder)
5893
108
{
5894
108
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5895
108
}
5896
5897
PyObject *
5898
PyUnicode_DecodeUTF32Stateful(const char *s,
5899
                              Py_ssize_t size,
5900
                              const char *errors,
5901
                              int *byteorder,
5902
                              Py_ssize_t *consumed)
5903
19.1k
{
5904
19.1k
    const char *starts = s;
5905
19.1k
    Py_ssize_t startinpos;
5906
19.1k
    Py_ssize_t endinpos;
5907
19.1k
    _PyUnicodeWriter writer;
5908
19.1k
    const unsigned char *q, *e;
5909
19.1k
    int le, bo = 0;       /* assume native ordering by default */
5910
19.1k
    const char *encoding;
5911
19.1k
    const char *errmsg = "";
5912
19.1k
    PyObject *errorHandler = NULL;
5913
19.1k
    PyObject *exc = NULL;
5914
5915
19.1k
    q = (const unsigned char *)s;
5916
19.1k
    e = q + size;
5917
5918
19.1k
    if (byteorder)
5919
19.0k
        bo = *byteorder;
5920
5921
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5922
       byte order setting accordingly. In native mode, the leading BOM
5923
       mark is skipped, in all other modes, it is copied to the output
5924
       stream as-is (giving a ZWNBSP character). */
5925
19.1k
    if (bo == 0 && size >= 4) {
5926
16.6k
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5927
16.6k
        if (bom == 0x0000FEFF) {
5928
133
            bo = -1;
5929
133
            q += 4;
5930
133
        }
5931
16.5k
        else if (bom == 0xFFFE0000) {
5932
204
            bo = 1;
5933
204
            q += 4;
5934
204
        }
5935
16.6k
        if (byteorder)
5936
16.5k
            *byteorder = bo;
5937
16.6k
    }
5938
5939
19.1k
    if (q == e) {
5940
78
        if (consumed)
5941
0
            *consumed = size;
5942
78
        _Py_RETURN_UNICODE_EMPTY();
5943
78
    }
5944
5945
#ifdef WORDS_BIGENDIAN
5946
    le = bo < 0;
5947
#else
5948
19.1k
    le = bo <= 0;
5949
19.1k
#endif
5950
19.1k
    encoding = le ? "utf-32-le" : "utf-32-be";
5951
5952
19.1k
    _PyUnicodeWriter_Init(&writer);
5953
19.1k
    writer.min_length = (e - q + 3) / 4;
5954
19.1k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5955
0
        goto onError;
5956
5957
107k
    while (1) {
5958
107k
        Py_UCS4 ch = 0;
5959
107k
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5960
5961
107k
        if (e - q >= 4) {
5962
94.8k
            int kind = writer.kind;
5963
94.8k
            void *data = writer.data;
5964
94.8k
            const unsigned char *last = e - 4;
5965
94.8k
            Py_ssize_t pos = writer.pos;
5966
94.8k
            if (le) {
5967
125k
                do {
5968
125k
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5969
125k
                    if (ch > maxch)
5970
90.3k
                        break;
5971
35.0k
                    if (kind != PyUnicode_1BYTE_KIND &&
5972
8.03k
                        Py_UNICODE_IS_SURROGATE(ch))
5973
246
                        break;
5974
34.7k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5975
34.7k
                    q += 4;
5976
34.7k
                } while (q <= last);
5977
91.6k
            }
5978
3.12k
            else {
5979
5.28k
                do {
5980
5.28k
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5981
5.28k
                    if (ch > maxch)
5982
2.89k
                        break;
5983
2.39k
                    if (kind != PyUnicode_1BYTE_KIND &&
5984
1.80k
                        Py_UNICODE_IS_SURROGATE(ch))
5985
111
                        break;
5986
2.28k
                    PyUnicode_WRITE(kind, data, pos++, ch);
5987
2.28k
                    q += 4;
5988
2.28k
                } while (q <= last);
5989
3.12k
            }
5990
94.8k
            writer.pos = pos;
5991
94.8k
        }
5992
5993
107k
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5994
359
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5995
359
            startinpos = ((const char *)q) - starts;
5996
359
            endinpos = startinpos + 4;
5997
359
        }
5998
107k
        else if (ch <= maxch) {
5999
14.3k
            if (q == e || consumed)
6000
3.33k
                break;
6001
            /* remaining bytes at the end? (size should be divisible by 4) */
6002
11.0k
            errmsg = "truncated data";
6003
11.0k
            startinpos = ((const char *)q) - starts;
6004
11.0k
            endinpos = ((const char *)e) - starts;
6005
11.0k
        }
6006
93.1k
        else {
6007
93.1k
            if (ch < 0x110000) {
6008
4.10k
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6009
0
                    goto onError;
6010
4.10k
                q += 4;
6011
4.10k
                continue;
6012
4.10k
            }
6013
89.0k
            errmsg = "code point not in range(0x110000)";
6014
89.0k
            startinpos = ((const char *)q) - starts;
6015
89.0k
            endinpos = startinpos + 4;
6016
89.0k
        }
6017
6018
        /* The remaining input chars are ignored if the callback
6019
           chooses to skip the input */
6020
100k
        if (unicode_decode_call_errorhandler_writer(
6021
100k
                errors, &errorHandler,
6022
100k
                encoding, errmsg,
6023
100k
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
6024
100k
                &writer))
6025
15.7k
            goto onError;
6026
100k
    }
6027
6028
3.33k
    if (consumed)
6029
0
        *consumed = (const char *)q-starts;
6030
6031
3.33k
    Py_XDECREF(errorHandler);
6032
3.33k
    Py_XDECREF(exc);
6033
3.33k
    return _PyUnicodeWriter_Finish(&writer);
6034
6035
15.7k
  onError:
6036
15.7k
    _PyUnicodeWriter_Dealloc(&writer);
6037
15.7k
    Py_XDECREF(errorHandler);
6038
15.7k
    Py_XDECREF(exc);
6039
15.7k
    return NULL;
6040
19.1k
}
6041
6042
PyObject *
6043
_PyUnicode_EncodeUTF32(PyObject *str,
6044
                       const char *errors,
6045
                       int byteorder)
6046
0
{
6047
0
    if (!PyUnicode_Check(str)) {
6048
0
        PyErr_BadArgument();
6049
0
        return NULL;
6050
0
    }
6051
0
    int kind = PyUnicode_KIND(str);
6052
0
    const void *data = PyUnicode_DATA(str);
6053
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
6054
6055
0
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
6056
0
        return PyErr_NoMemory();
6057
0
    Py_ssize_t nsize = len + (byteorder == 0);
6058
6059
0
#if PY_LITTLE_ENDIAN
6060
0
    int native_ordering = byteorder <= 0;
6061
#else
6062
    int native_ordering = byteorder >= 0;
6063
#endif
6064
6065
0
    if (kind == PyUnicode_1BYTE_KIND) {
6066
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
6067
        // on short strings
6068
0
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 4);
6069
0
        if (v == NULL) {
6070
0
            return NULL;
6071
0
        }
6072
6073
        /* output buffer is 4-bytes aligned */
6074
0
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
6075
0
        uint32_t *out = (uint32_t *)PyBytes_AS_STRING(v);
6076
0
        if (byteorder == 0) {
6077
0
            *out++ = 0xFEFF;
6078
0
        }
6079
0
        if (len > 0) {
6080
0
            ucs1lib_utf32_encode((const Py_UCS1 *)data, len,
6081
0
                                 &out, native_ordering);
6082
0
        }
6083
0
        return v;
6084
0
    }
6085
6086
0
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 4);
6087
0
    if (writer == NULL) {
6088
0
        return NULL;
6089
0
    }
6090
6091
    /* output buffer is 4-bytes aligned */
6092
0
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 4));
6093
0
    uint32_t *out = (uint32_t *)PyBytesWriter_GetData(writer);
6094
0
    if (byteorder == 0) {
6095
0
        *out++ = 0xFEFF;
6096
0
    }
6097
0
    if (len == 0) {
6098
0
        return PyBytesWriter_Finish(writer);
6099
0
    }
6100
6101
0
    const char *encoding;
6102
0
    if (byteorder == -1)
6103
0
        encoding = "utf-32-le";
6104
0
    else if (byteorder == 1)
6105
0
        encoding = "utf-32-be";
6106
0
    else
6107
0
        encoding = "utf-32";
6108
6109
0
    PyObject *errorHandler = NULL;
6110
0
    PyObject *exc = NULL;
6111
0
    PyObject *rep = NULL;
6112
6113
0
    for (Py_ssize_t pos = 0; pos < len; ) {
6114
0
        if (kind == PyUnicode_2BYTE_KIND) {
6115
0
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
6116
0
                                        &out, native_ordering);
6117
0
        }
6118
0
        else {
6119
0
            assert(kind == PyUnicode_4BYTE_KIND);
6120
0
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
6121
0
                                        &out, native_ordering);
6122
0
        }
6123
0
        if (pos == len)
6124
0
            break;
6125
6126
0
        Py_ssize_t newpos;
6127
0
        rep = unicode_encode_call_errorhandler(
6128
0
                errors, &errorHandler,
6129
0
                encoding, "surrogates not allowed",
6130
0
                str, &exc, pos, pos + 1, &newpos);
6131
0
        if (!rep)
6132
0
            goto error;
6133
6134
0
        Py_ssize_t repsize, moreunits;
6135
0
        if (PyBytes_Check(rep)) {
6136
0
            repsize = PyBytes_GET_SIZE(rep);
6137
0
            if (repsize & 3) {
6138
0
                raise_encode_exception(&exc, encoding,
6139
0
                                       str, pos, pos + 1,
6140
0
                                       "surrogates not allowed");
6141
0
                goto error;
6142
0
            }
6143
0
            moreunits = repsize / 4;
6144
0
        }
6145
0
        else {
6146
0
            assert(PyUnicode_Check(rep));
6147
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6148
0
            if (!PyUnicode_IS_ASCII(rep)) {
6149
0
                raise_encode_exception(&exc, encoding,
6150
0
                                       str, pos, pos + 1,
6151
0
                                       "surrogates not allowed");
6152
0
                goto error;
6153
0
            }
6154
0
        }
6155
0
        moreunits += pos - newpos;
6156
0
        pos = newpos;
6157
6158
        /* four bytes are reserved for each surrogate */
6159
0
        if (moreunits > 0) {
6160
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 4 * moreunits, out);
6161
0
            if (out == NULL) {
6162
0
                goto error;
6163
0
            }
6164
0
        }
6165
6166
0
        if (PyBytes_Check(rep)) {
6167
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6168
0
            out += repsize / 4;
6169
0
        }
6170
0
        else {
6171
            /* rep is unicode */
6172
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6173
0
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6174
0
                                 &out, native_ordering);
6175
0
        }
6176
6177
0
        Py_CLEAR(rep);
6178
0
    }
6179
6180
0
    Py_XDECREF(errorHandler);
6181
0
    Py_XDECREF(exc);
6182
6183
    /* Cut back to size actually needed. This is necessary for, for example,
6184
       encoding of a string containing isolated surrogates and the 'ignore'
6185
       handler is used. */
6186
0
    return PyBytesWriter_FinishWithPointer(writer, out);
6187
6188
0
  error:
6189
0
    Py_XDECREF(rep);
6190
0
    Py_XDECREF(errorHandler);
6191
0
    Py_XDECREF(exc);
6192
0
    PyBytesWriter_Discard(writer);
6193
0
    return NULL;
6194
0
}
6195
6196
PyObject *
6197
PyUnicode_AsUTF32String(PyObject *unicode)
6198
0
{
6199
0
    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
6200
0
}
6201
6202
/* --- UTF-16 Codec ------------------------------------------------------- */
6203
6204
PyObject *
6205
PyUnicode_DecodeUTF16(const char *s,
6206
                      Py_ssize_t size,
6207
                      const char *errors,
6208
                      int *byteorder)
6209
107
{
6210
107
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6211
107
}
6212
6213
PyObject *
6214
PyUnicode_DecodeUTF16Stateful(const char *s,
6215
                              Py_ssize_t size,
6216
                              const char *errors,
6217
                              int *byteorder,
6218
                              Py_ssize_t *consumed)
6219
14.6k
{
6220
14.6k
    const char *starts = s;
6221
14.6k
    Py_ssize_t startinpos;
6222
14.6k
    Py_ssize_t endinpos;
6223
14.6k
    _PyUnicodeWriter writer;
6224
14.6k
    const unsigned char *q, *e;
6225
14.6k
    int bo = 0;       /* assume native ordering by default */
6226
14.6k
    int native_ordering;
6227
14.6k
    const char *errmsg = "";
6228
14.6k
    PyObject *errorHandler = NULL;
6229
14.6k
    PyObject *exc = NULL;
6230
14.6k
    const char *encoding;
6231
6232
14.6k
    q = (const unsigned char *)s;
6233
14.6k
    e = q + size;
6234
6235
14.6k
    if (byteorder)
6236
14.5k
        bo = *byteorder;
6237
6238
    /* Check for BOM marks (U+FEFF) in the input and adjust current
6239
       byte order setting accordingly. In native mode, the leading BOM
6240
       mark is skipped, in all other modes, it is copied to the output
6241
       stream as-is (giving a ZWNBSP character). */
6242
14.6k
    if (bo == 0 && size >= 2) {
6243
13.8k
        const Py_UCS4 bom = (q[1] << 8) | q[0];
6244
13.8k
        if (bom == 0xFEFF) {
6245
278
            q += 2;
6246
278
            bo = -1;
6247
278
        }
6248
13.5k
        else if (bom == 0xFFFE) {
6249
1.97k
            q += 2;
6250
1.97k
            bo = 1;
6251
1.97k
        }
6252
13.8k
        if (byteorder)
6253
13.7k
            *byteorder = bo;
6254
13.8k
    }
6255
6256
14.6k
    if (q == e) {
6257
71
        if (consumed)
6258
0
            *consumed = size;
6259
71
        _Py_RETURN_UNICODE_EMPTY();
6260
71
    }
6261
6262
14.5k
#if PY_LITTLE_ENDIAN
6263
14.5k
    native_ordering = bo <= 0;
6264
14.5k
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
6265
#else
6266
    native_ordering = bo >= 0;
6267
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
6268
#endif
6269
6270
    /* Note: size will always be longer than the resulting Unicode
6271
       character count normally.  Error handler will take care of
6272
       resizing when needed. */
6273
14.5k
    _PyUnicodeWriter_Init(&writer);
6274
14.5k
    writer.min_length = (e - q + 1) / 2;
6275
14.5k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
6276
0
        goto onError;
6277
6278
51.6k
    while (1) {
6279
51.6k
        Py_UCS4 ch = 0;
6280
51.6k
        if (e - q >= 2) {
6281
43.0k
            int kind = writer.kind;
6282
43.0k
            if (kind == PyUnicode_1BYTE_KIND) {
6283
17.1k
                if (PyUnicode_IS_ASCII(writer.buffer))
6284
13.9k
                    ch = asciilib_utf16_decode(&q, e,
6285
13.9k
                            (Py_UCS1*)writer.data, &writer.pos,
6286
13.9k
                            native_ordering);
6287
3.25k
                else
6288
3.25k
                    ch = ucs1lib_utf16_decode(&q, e,
6289
3.25k
                            (Py_UCS1*)writer.data, &writer.pos,
6290
3.25k
                            native_ordering);
6291
25.8k
            } else if (kind == PyUnicode_2BYTE_KIND) {
6292
11.0k
                ch = ucs2lib_utf16_decode(&q, e,
6293
11.0k
                        (Py_UCS2*)writer.data, &writer.pos,
6294
11.0k
                        native_ordering);
6295
14.8k
            } else {
6296
14.8k
                assert(kind == PyUnicode_4BYTE_KIND);
6297
14.8k
                ch = ucs4lib_utf16_decode(&q, e,
6298
14.8k
                        (Py_UCS4*)writer.data, &writer.pos,
6299
14.8k
                        native_ordering);
6300
14.8k
            }
6301
43.0k
        }
6302
6303
51.6k
        switch (ch)
6304
51.6k
        {
6305
14.4k
        case 0:
6306
            /* remaining byte at the end? (size should be even) */
6307
14.4k
            if (q == e || consumed)
6308
9.86k
                goto End;
6309
4.54k
            errmsg = "truncated data";
6310
4.54k
            startinpos = ((const char *)q) - starts;
6311
4.54k
            endinpos = ((const char *)e) - starts;
6312
4.54k
            break;
6313
            /* The remaining input chars are ignored if the callback
6314
               chooses to skip the input */
6315
1.84k
        case 1:
6316
1.84k
            q -= 2;
6317
1.84k
            if (consumed)
6318
0
                goto End;
6319
1.84k
            errmsg = "unexpected end of data";
6320
1.84k
            startinpos = ((const char *)q) - starts;
6321
1.84k
            endinpos = ((const char *)e) - starts;
6322
1.84k
            break;
6323
13.1k
        case 2:
6324
13.1k
            errmsg = "illegal encoding";
6325
13.1k
            startinpos = ((const char *)q) - 2 - starts;
6326
13.1k
            endinpos = startinpos + 2;
6327
13.1k
            break;
6328
6.29k
        case 3:
6329
6.29k
            errmsg = "illegal UTF-16 surrogate";
6330
6.29k
            startinpos = ((const char *)q) - 4 - starts;
6331
6.29k
            endinpos = startinpos + 2;
6332
6.29k
            break;
6333
15.9k
        default:
6334
15.9k
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6335
0
                goto onError;
6336
15.9k
            continue;
6337
51.6k
        }
6338
6339
25.7k
        if (unicode_decode_call_errorhandler_writer(
6340
25.7k
                errors,
6341
25.7k
                &errorHandler,
6342
25.7k
                encoding, errmsg,
6343
25.7k
                &starts,
6344
25.7k
                (const char **)&e,
6345
25.7k
                &startinpos,
6346
25.7k
                &endinpos,
6347
25.7k
                &exc,
6348
25.7k
                (const char **)&q,
6349
25.7k
                &writer))
6350
4.71k
            goto onError;
6351
25.7k
    }
6352
6353
9.86k
End:
6354
9.86k
    if (consumed)
6355
0
        *consumed = (const char *)q-starts;
6356
6357
9.86k
    Py_XDECREF(errorHandler);
6358
9.86k
    Py_XDECREF(exc);
6359
9.86k
    return _PyUnicodeWriter_Finish(&writer);
6360
6361
4.71k
  onError:
6362
4.71k
    _PyUnicodeWriter_Dealloc(&writer);
6363
4.71k
    Py_XDECREF(errorHandler);
6364
4.71k
    Py_XDECREF(exc);
6365
4.71k
    return NULL;
6366
14.5k
}
6367
6368
PyObject *
6369
_PyUnicode_EncodeUTF16(PyObject *str,
6370
                       const char *errors,
6371
                       int byteorder)
6372
0
{
6373
0
    if (!PyUnicode_Check(str)) {
6374
0
        PyErr_BadArgument();
6375
0
        return NULL;
6376
0
    }
6377
0
    int kind = PyUnicode_KIND(str);
6378
0
    const void *data = PyUnicode_DATA(str);
6379
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(str);
6380
6381
0
    Py_ssize_t pairs = 0;
6382
0
    if (kind == PyUnicode_4BYTE_KIND) {
6383
0
        const Py_UCS4 *in = (const Py_UCS4 *)data;
6384
0
        const Py_UCS4 *end = in + len;
6385
0
        while (in < end) {
6386
0
            if (*in++ >= 0x10000) {
6387
0
                pairs++;
6388
0
            }
6389
0
        }
6390
0
    }
6391
0
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
6392
0
        return PyErr_NoMemory();
6393
0
    }
6394
0
    Py_ssize_t nsize = len + pairs + (byteorder == 0);
6395
6396
#if PY_BIG_ENDIAN
6397
    int native_ordering = byteorder >= 0;
6398
#else
6399
0
    int native_ordering = byteorder <= 0;
6400
0
#endif
6401
6402
0
    if (kind == PyUnicode_1BYTE_KIND) {
6403
        // gh-139156: Don't use PyBytesWriter API here since it has an overhead
6404
        // on short strings
6405
0
        PyObject *v = PyBytes_FromStringAndSize(NULL, nsize * 2);
6406
0
        if (v == NULL) {
6407
0
            return NULL;
6408
0
        }
6409
6410
        /* output buffer is 2-bytes aligned */
6411
0
        assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
6412
0
        unsigned short *out = (unsigned short *)PyBytes_AS_STRING(v);
6413
0
        if (byteorder == 0) {
6414
0
            *out++ = 0xFEFF;
6415
0
        }
6416
0
        if (len > 0) {
6417
0
            ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6418
0
        }
6419
0
        return v;
6420
0
    }
6421
6422
0
    PyBytesWriter *writer = PyBytesWriter_Create(nsize * 2);
6423
0
    if (writer == NULL) {
6424
0
        return NULL;
6425
0
    }
6426
6427
    /* output buffer is 2-bytes aligned */
6428
0
    assert(_Py_IS_ALIGNED(PyBytesWriter_GetData(writer), 2));
6429
0
    unsigned short *out = PyBytesWriter_GetData(writer);
6430
0
    if (byteorder == 0) {
6431
0
        *out++ = 0xFEFF;
6432
0
    }
6433
0
    if (len == 0) {
6434
0
        return PyBytesWriter_Finish(writer);
6435
0
    }
6436
6437
0
    const char *encoding;
6438
0
    if (byteorder < 0) {
6439
0
        encoding = "utf-16-le";
6440
0
    }
6441
0
    else if (byteorder > 0) {
6442
0
        encoding = "utf-16-be";
6443
0
    }
6444
0
    else {
6445
0
        encoding = "utf-16";
6446
0
    }
6447
6448
0
    PyObject *errorHandler = NULL;
6449
0
    PyObject *exc = NULL;
6450
0
    PyObject *rep = NULL;
6451
6452
0
    for (Py_ssize_t pos = 0; pos < len; ) {
6453
0
        if (kind == PyUnicode_2BYTE_KIND) {
6454
0
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6455
0
                                        &out, native_ordering);
6456
0
        }
6457
0
        else {
6458
0
            assert(kind == PyUnicode_4BYTE_KIND);
6459
0
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6460
0
                                        &out, native_ordering);
6461
0
        }
6462
0
        if (pos == len)
6463
0
            break;
6464
6465
0
        Py_ssize_t newpos;
6466
0
        rep = unicode_encode_call_errorhandler(
6467
0
                errors, &errorHandler,
6468
0
                encoding, "surrogates not allowed",
6469
0
                str, &exc, pos, pos + 1, &newpos);
6470
0
        if (!rep)
6471
0
            goto error;
6472
6473
0
        Py_ssize_t repsize, moreunits;
6474
0
        if (PyBytes_Check(rep)) {
6475
0
            repsize = PyBytes_GET_SIZE(rep);
6476
0
            if (repsize & 1) {
6477
0
                raise_encode_exception(&exc, encoding,
6478
0
                                       str, pos, pos + 1,
6479
0
                                       "surrogates not allowed");
6480
0
                goto error;
6481
0
            }
6482
0
            moreunits = repsize / 2;
6483
0
        }
6484
0
        else {
6485
0
            assert(PyUnicode_Check(rep));
6486
0
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6487
0
            if (!PyUnicode_IS_ASCII(rep)) {
6488
0
                raise_encode_exception(&exc, encoding,
6489
0
                                       str, pos, pos + 1,
6490
0
                                       "surrogates not allowed");
6491
0
                goto error;
6492
0
            }
6493
0
        }
6494
0
        moreunits += pos - newpos;
6495
0
        pos = newpos;
6496
6497
        /* two bytes are reserved for each surrogate */
6498
0
        if (moreunits > 0) {
6499
0
            out = PyBytesWriter_GrowAndUpdatePointer(writer, 2 * moreunits, out);
6500
0
            if (out == NULL) {
6501
0
                goto error;
6502
0
            }
6503
0
        }
6504
6505
0
        if (PyBytes_Check(rep)) {
6506
0
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
6507
0
            out += repsize / 2;
6508
0
        } else {
6509
            /* rep is unicode */
6510
0
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6511
0
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6512
0
                                 &out, native_ordering);
6513
0
        }
6514
6515
0
        Py_CLEAR(rep);
6516
0
    }
6517
6518
0
    Py_XDECREF(errorHandler);
6519
0
    Py_XDECREF(exc);
6520
6521
    /* Cut back to size actually needed. This is necessary for, for example,
6522
    encoding of a string containing isolated surrogates and the 'ignore' handler
6523
    is used. */
6524
0
    return PyBytesWriter_FinishWithPointer(writer, out);
6525
6526
0
  error:
6527
0
    Py_XDECREF(rep);
6528
0
    Py_XDECREF(errorHandler);
6529
0
    Py_XDECREF(exc);
6530
0
    PyBytesWriter_Discard(writer);
6531
0
    return NULL;
6532
0
}
6533
6534
PyObject *
6535
PyUnicode_AsUTF16String(PyObject *unicode)
6536
0
{
6537
0
    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
6538
0
}
6539
6540
_PyUnicode_Name_CAPI *
6541
_PyUnicode_GetNameCAPI(void)
6542
2.78k
{
6543
2.78k
    PyInterpreterState *interp = _PyInterpreterState_GET();
6544
2.78k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6545
6546
2.78k
    ucnhash_capi = _Py_atomic_load_ptr(&interp->unicode.ucnhash_capi);
6547
2.78k
    if (ucnhash_capi == NULL) {
6548
1
        ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6549
1
                PyUnicodeData_CAPSULE_NAME, 1);
6550
6551
        // It's fine if we overwrite the value here. It's always the same value.
6552
1
        _Py_atomic_store_ptr(&interp->unicode.ucnhash_capi, ucnhash_capi);
6553
1
    }
6554
2.78k
    return ucnhash_capi;
6555
2.78k
}
6556
6557
/* --- Unicode Escape Codec ----------------------------------------------- */
6558
6559
PyObject *
6560
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
6561
                               Py_ssize_t size,
6562
                               const char *errors,
6563
                               Py_ssize_t *consumed,
6564
                               int *first_invalid_escape_char,
6565
                               const char **first_invalid_escape_ptr)
6566
31.9k
{
6567
31.9k
    const char *starts = s;
6568
31.9k
    const char *initial_starts = starts;
6569
31.9k
    _PyUnicodeWriter writer;
6570
31.9k
    const char *end;
6571
31.9k
    PyObject *errorHandler = NULL;
6572
31.9k
    PyObject *exc = NULL;
6573
31.9k
    _PyUnicode_Name_CAPI *ucnhash_capi;
6574
6575
    // so we can remember if we've seen an invalid escape char or not
6576
31.9k
    *first_invalid_escape_char = -1;
6577
31.9k
    *first_invalid_escape_ptr = NULL;
6578
6579
31.9k
    if (size == 0) {
6580
2.01k
        if (consumed) {
6581
0
            *consumed = 0;
6582
0
        }
6583
2.01k
        _Py_RETURN_UNICODE_EMPTY();
6584
2.01k
    }
6585
    /* Escaped strings will always be longer than the resulting
6586
       Unicode string, so we start with size here and then reduce the
6587
       length after conversion to the true value.
6588
       (but if the error callback returns a long replacement string
6589
       we'll have to allocate more space) */
6590
29.9k
    _PyUnicodeWriter_Init(&writer);
6591
29.9k
    writer.min_length = size;
6592
29.9k
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6593
0
        goto onError;
6594
0
    }
6595
6596
29.9k
    end = s + size;
6597
193k
    while (s < end) {
6598
163k
        unsigned char c = (unsigned char) *s++;
6599
163k
        Py_UCS4 ch;
6600
163k
        int count;
6601
163k
        const char *message;
6602
6603
163k
#define WRITE_ASCII_CHAR(ch)                                                  \
6604
163k
            do {                                                              \
6605
14.9k
                assert(ch <= 127);                                            \
6606
14.9k
                assert(writer.pos < writer.size);                             \
6607
14.9k
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6608
14.9k
            } while(0)
6609
6610
163k
#define WRITE_CHAR(ch)                                                        \
6611
163k
            do {                                                              \
6612
153k
                if (ch <= writer.maxchar) {                                   \
6613
137k
                    assert(writer.pos < writer.size);                         \
6614
137k
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6615
137k
                }                                                             \
6616
153k
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6617
0
                    goto onError;                                             \
6618
0
                }                                                             \
6619
153k
            } while(0)
6620
6621
        /* Non-escape characters are interpreted as Unicode ordinals */
6622
163k
        if (c != '\\') {
6623
100k
            WRITE_CHAR(c);
6624
100k
            continue;
6625
100k
        }
6626
6627
63.5k
        Py_ssize_t startinpos = s - starts - 1;
6628
        /* \ - Escapes */
6629
63.5k
        if (s >= end) {
6630
0
            message = "\\ at end of string";
6631
0
            goto incomplete;
6632
0
        }
6633
63.5k
        c = (unsigned char) *s++;
6634
6635
63.5k
        assert(writer.pos < writer.size);
6636
63.5k
        switch (c) {
6637
6638
            /* \x escapes */
6639
813
        case '\n': continue;
6640
1.48k
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6641
868
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6642
1.21k
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6643
496
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6644
        /* FF */
6645
963
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6646
842
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6647
910
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6648
1.41k
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6649
        /* VT */
6650
832
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6651
        /* BEL, not classic C */
6652
694
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6653
6654
            /* \OOO (octal) escapes */
6655
3.62k
        case '0': case '1': case '2': case '3':
6656
6.72k
        case '4': case '5': case '6': case '7':
6657
6.72k
            ch = c - '0';
6658
6.72k
            if (s < end && '0' <= *s && *s <= '7') {
6659
2.57k
                ch = (ch<<3) + *s++ - '0';
6660
2.57k
                if (s < end && '0' <= *s && *s <= '7') {
6661
1.33k
                    ch = (ch<<3) + *s++ - '0';
6662
1.33k
                }
6663
2.57k
            }
6664
6.72k
            if (ch > 0377) {
6665
1.03k
                if (*first_invalid_escape_char == -1) {
6666
790
                    *first_invalid_escape_char = ch;
6667
790
                    if (starts == initial_starts) {
6668
                        /* Back up 3 chars, since we've already incremented s. */
6669
790
                        *first_invalid_escape_ptr = s - 3;
6670
790
                    }
6671
790
                }
6672
1.03k
            }
6673
6.72k
            WRITE_CHAR(ch);
6674
6.72k
            continue;
6675
6676
            /* hex escapes */
6677
            /* \xXX */
6678
6.72k
        case 'x':
6679
5.96k
            count = 2;
6680
5.96k
            message = "truncated \\xXX escape";
6681
5.96k
            goto hexescape;
6682
6683
            /* \uXXXX */
6684
9.33k
        case 'u':
6685
9.33k
            count = 4;
6686
9.33k
            message = "truncated \\uXXXX escape";
6687
9.33k
            goto hexescape;
6688
6689
            /* \UXXXXXXXX */
6690
22.9k
        case 'U':
6691
22.9k
            count = 8;
6692
22.9k
            message = "truncated \\UXXXXXXXX escape";
6693
38.2k
        hexescape:
6694
271k
            for (ch = 0; count; ++s, --count) {
6695
233k
                if (s >= end) {
6696
5
                    goto incomplete;
6697
5
                }
6698
233k
                c = (unsigned char)*s;
6699
233k
                ch <<= 4;
6700
233k
                if (c >= '0' && c <= '9') {
6701
173k
                    ch += c - '0';
6702
173k
                }
6703
59.5k
                else if (c >= 'a' && c <= 'f') {
6704
59.3k
                    ch += c - ('a' - 10);
6705
59.3k
                }
6706
244
                else if (c >= 'A' && c <= 'F') {
6707
239
                    ch += c - ('A' - 10);
6708
239
                }
6709
5
                else {
6710
5
                    goto error;
6711
5
                }
6712
233k
            }
6713
6714
            /* when we get here, ch is a 32-bit unicode character */
6715
38.2k
            if (ch > MAX_UNICODE) {
6716
1
                message = "illegal Unicode character";
6717
1
                goto error;
6718
1
            }
6719
6720
38.2k
            WRITE_CHAR(ch);
6721
38.2k
            continue;
6722
6723
            /* \N{name} */
6724
38.2k
        case 'N':
6725
2.78k
            ucnhash_capi = _PyUnicode_GetNameCAPI();
6726
2.78k
            if (ucnhash_capi == NULL) {
6727
0
                PyErr_SetString(
6728
0
                        PyExc_UnicodeError,
6729
0
                        "\\N escapes not supported (can't load unicodedata module)"
6730
0
                );
6731
0
                goto onError;
6732
0
            }
6733
6734
2.78k
            message = "malformed \\N character escape";
6735
2.78k
            if (s >= end) {
6736
4
                goto incomplete;
6737
4
            }
6738
2.78k
            if (*s == '{') {
6739
2.77k
                const char *start = ++s;
6740
2.77k
                size_t namelen;
6741
                /* look for the closing brace */
6742
40.7k
                while (s < end && *s != '}')
6743
38.0k
                    s++;
6744
2.77k
                if (s >= end) {
6745
12
                    goto incomplete;
6746
12
                }
6747
2.76k
                namelen = s - start;
6748
2.76k
                if (namelen) {
6749
                    /* found a name.  look it up in the unicode database */
6750
2.76k
                    s++;
6751
2.76k
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6752
2.76k
                    if (namelen <= INT_MAX &&
6753
2.76k
                        ucnhash_capi->getcode(start, (int)namelen,
6754
2.76k
                                              &ch, 0)) {
6755
2.70k
                        assert(ch <= MAX_UNICODE);
6756
2.70k
                        WRITE_CHAR(ch);
6757
2.70k
                        continue;
6758
2.70k
                    }
6759
64
                    message = "unknown Unicode character name";
6760
64
                }
6761
2.76k
            }
6762
69
            goto error;
6763
6764
5.25k
        default:
6765
5.25k
            if (*first_invalid_escape_char == -1) {
6766
3.92k
                *first_invalid_escape_char = c;
6767
3.92k
                if (starts == initial_starts) {
6768
                    /* Back up one char, since we've already incremented s. */
6769
3.92k
                    *first_invalid_escape_ptr = s - 1;
6770
3.92k
                }
6771
3.92k
            }
6772
5.25k
            WRITE_ASCII_CHAR('\\');
6773
5.25k
            WRITE_CHAR(c);
6774
5.25k
            continue;
6775
63.5k
        }
6776
6777
21
      incomplete:
6778
21
        if (consumed) {
6779
0
            *consumed = startinpos;
6780
0
            break;
6781
0
        }
6782
96
      error:;
6783
96
        Py_ssize_t endinpos = s-starts;
6784
96
        writer.min_length = end - s + writer.pos;
6785
96
        if (unicode_decode_call_errorhandler_writer(
6786
96
                errors, &errorHandler,
6787
96
                "unicodeescape", message,
6788
96
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6789
96
                &writer)) {
6790
96
            goto onError;
6791
96
        }
6792
96
        assert(end - s <= writer.size - writer.pos);
6793
6794
0
#undef WRITE_ASCII_CHAR
6795
0
#undef WRITE_CHAR
6796
0
    }
6797
6798
29.8k
    Py_XDECREF(errorHandler);
6799
29.8k
    Py_XDECREF(exc);
6800
29.8k
    return _PyUnicodeWriter_Finish(&writer);
6801
6802
96
  onError:
6803
96
    _PyUnicodeWriter_Dealloc(&writer);
6804
96
    Py_XDECREF(errorHandler);
6805
96
    Py_XDECREF(exc);
6806
96
    return NULL;
6807
29.9k
}
6808
6809
PyObject *
6810
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
6811
                              Py_ssize_t size,
6812
                              const char *errors,
6813
                              Py_ssize_t *consumed)
6814
0
{
6815
0
    int first_invalid_escape_char;
6816
0
    const char *first_invalid_escape_ptr;
6817
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
6818
0
                                                      consumed,
6819
0
                                                      &first_invalid_escape_char,
6820
0
                                                      &first_invalid_escape_ptr);
6821
0
    if (result == NULL)
6822
0
        return NULL;
6823
0
    if (first_invalid_escape_char != -1) {
6824
0
        if (first_invalid_escape_char > 0xff) {
6825
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6826
0
                                 "\"\\%o\" is an invalid octal escape sequence. "
6827
0
                                 "Such sequences will not work in the future. ",
6828
0
                                 first_invalid_escape_char) < 0)
6829
0
            {
6830
0
                Py_DECREF(result);
6831
0
                return NULL;
6832
0
            }
6833
0
        }
6834
0
        else {
6835
0
            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6836
0
                                 "\"\\%c\" is an invalid escape sequence. "
6837
0
                                 "Such sequences will not work in the future. ",
6838
0
                                 first_invalid_escape_char) < 0)
6839
0
            {
6840
0
                Py_DECREF(result);
6841
0
                return NULL;
6842
0
            }
6843
0
        }
6844
0
    }
6845
0
    return result;
6846
0
}
6847
6848
PyObject *
6849
PyUnicode_DecodeUnicodeEscape(const char *s,
6850
                              Py_ssize_t size,
6851
                              const char *errors)
6852
0
{
6853
0
    return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6854
0
}
6855
6856
/* Return a Unicode-Escape string version of the Unicode object. */
6857
6858
PyObject *
6859
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6860
613k
{
6861
613k
    if (!PyUnicode_Check(unicode)) {
6862
0
        PyErr_BadArgument();
6863
0
        return NULL;
6864
0
    }
6865
6866
613k
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
6867
613k
    if (len == 0) {
6868
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
6869
0
    }
6870
613k
    int kind = PyUnicode_KIND(unicode);
6871
613k
    const void *data = PyUnicode_DATA(unicode);
6872
6873
    /* Initial allocation is based on the longest-possible character
6874
     * escape.
6875
     *
6876
     * For UCS1 strings it's '\xxx', 4 bytes per source character.
6877
     * For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6878
     * For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character. */
6879
613k
    Py_ssize_t expandsize = kind * 2 + 2;
6880
613k
    if (len > PY_SSIZE_T_MAX / expandsize) {
6881
0
        return PyErr_NoMemory();
6882
0
    }
6883
6884
613k
    PyBytesWriter *writer = PyBytesWriter_Create(expandsize * len);
6885
613k
    if (writer == NULL) {
6886
0
        return NULL;
6887
0
    }
6888
613k
    char *p = PyBytesWriter_GetData(writer);
6889
6890
1.22M
    for (Py_ssize_t i = 0; i < len; i++) {
6891
613k
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6892
6893
        /* U+0000-U+00ff range */
6894
613k
        if (ch < 0x100) {
6895
605k
            if (ch >= ' ' && ch < 127) {
6896
49.2k
                if (ch != '\\') {
6897
                    /* Copy printable US ASCII as-is */
6898
0
                    *p++ = (char) ch;
6899
0
                }
6900
                /* Escape backslashes */
6901
49.2k
                else {
6902
49.2k
                    *p++ = '\\';
6903
49.2k
                    *p++ = '\\';
6904
49.2k
                }
6905
49.2k
            }
6906
6907
            /* Map special whitespace to '\t', \n', '\r' */
6908
556k
            else if (ch == '\t') {
6909
2.77k
                *p++ = '\\';
6910
2.77k
                *p++ = 't';
6911
2.77k
            }
6912
553k
            else if (ch == '\n') {
6913
932
                *p++ = '\\';
6914
932
                *p++ = 'n';
6915
932
            }
6916
552k
            else if (ch == '\r') {
6917
546
                *p++ = '\\';
6918
546
                *p++ = 'r';
6919
546
            }
6920
6921
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6922
551k
            else {
6923
551k
                *p++ = '\\';
6924
551k
                *p++ = 'x';
6925
551k
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6926
551k
                *p++ = Py_hexdigits[ch & 0x000F];
6927
551k
            }
6928
605k
        }
6929
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6930
7.63k
        else if (ch < 0x10000) {
6931
6.48k
            *p++ = '\\';
6932
6.48k
            *p++ = 'u';
6933
6.48k
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6934
6.48k
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6935
6.48k
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6936
6.48k
            *p++ = Py_hexdigits[ch & 0x000F];
6937
6.48k
        }
6938
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6939
1.15k
        else {
6940
6941
            /* Make sure that the first two digits are zero */
6942
1.15k
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6943
1.15k
            *p++ = '\\';
6944
1.15k
            *p++ = 'U';
6945
1.15k
            *p++ = '0';
6946
1.15k
            *p++ = '0';
6947
1.15k
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6948
1.15k
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6949
1.15k
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6950
1.15k
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6951
1.15k
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6952
1.15k
            *p++ = Py_hexdigits[ch & 0x0000000F];
6953
1.15k
        }
6954
613k
    }
6955
6956
613k
    return PyBytesWriter_FinishWithPointer(writer, p);
6957
613k
}
6958
6959
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6960
6961
PyObject *
6962
_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6963
                                          Py_ssize_t size,
6964
                                          const char *errors,
6965
                                          Py_ssize_t *consumed)
6966
0
{
6967
0
    const char *starts = s;
6968
0
    _PyUnicodeWriter writer;
6969
0
    const char *end;
6970
0
    PyObject *errorHandler = NULL;
6971
0
    PyObject *exc = NULL;
6972
6973
0
    if (size == 0) {
6974
0
        if (consumed) {
6975
0
            *consumed = 0;
6976
0
        }
6977
0
        _Py_RETURN_UNICODE_EMPTY();
6978
0
    }
6979
6980
    /* Escaped strings will always be longer than the resulting
6981
       Unicode string, so we start with size here and then reduce the
6982
       length after conversion to the true value. (But decoding error
6983
       handler might have to resize the string) */
6984
0
    _PyUnicodeWriter_Init(&writer);
6985
0
    writer.min_length = size;
6986
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6987
0
        goto onError;
6988
0
    }
6989
6990
0
    end = s + size;
6991
0
    while (s < end) {
6992
0
        unsigned char c = (unsigned char) *s++;
6993
0
        Py_UCS4 ch;
6994
0
        int count;
6995
0
        const char *message;
6996
6997
0
#define WRITE_CHAR(ch)                                                        \
6998
0
            do {                                                              \
6999
0
                if (ch <= writer.maxchar) {                                   \
7000
0
                    assert(writer.pos < writer.size);                         \
7001
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
7002
0
                }                                                             \
7003
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
7004
0
                    goto onError;                                             \
7005
0
                }                                                             \
7006
0
            } while(0)
7007
7008
        /* Non-escape characters are interpreted as Unicode ordinals */
7009
0
        if (c != '\\' || (s >= end && !consumed)) {
7010
0
            WRITE_CHAR(c);
7011
0
            continue;
7012
0
        }
7013
7014
0
        Py_ssize_t startinpos = s - starts - 1;
7015
        /* \ - Escapes */
7016
0
        if (s >= end) {
7017
0
            assert(consumed);
7018
            // Set message to silent compiler warning.
7019
            // Actually it is never used.
7020
0
            message = "\\ at end of string";
7021
0
            goto incomplete;
7022
0
        }
7023
7024
0
        c = (unsigned char) *s++;
7025
0
        if (c == 'u') {
7026
0
            count = 4;
7027
0
            message = "truncated \\uXXXX escape";
7028
0
        }
7029
0
        else if (c == 'U') {
7030
0
            count = 8;
7031
0
            message = "truncated \\UXXXXXXXX escape";
7032
0
        }
7033
0
        else {
7034
0
            assert(writer.pos < writer.size);
7035
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
7036
0
            WRITE_CHAR(c);
7037
0
            continue;
7038
0
        }
7039
7040
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
7041
0
        for (ch = 0; count; ++s, --count) {
7042
0
            if (s >= end) {
7043
0
                goto incomplete;
7044
0
            }
7045
0
            c = (unsigned char)*s;
7046
0
            ch <<= 4;
7047
0
            if (c >= '0' && c <= '9') {
7048
0
                ch += c - '0';
7049
0
            }
7050
0
            else if (c >= 'a' && c <= 'f') {
7051
0
                ch += c - ('a' - 10);
7052
0
            }
7053
0
            else if (c >= 'A' && c <= 'F') {
7054
0
                ch += c - ('A' - 10);
7055
0
            }
7056
0
            else {
7057
0
                goto error;
7058
0
            }
7059
0
        }
7060
0
        if (ch > MAX_UNICODE) {
7061
0
            message = "\\Uxxxxxxxx out of range";
7062
0
            goto error;
7063
0
        }
7064
0
        WRITE_CHAR(ch);
7065
0
        continue;
7066
7067
0
      incomplete:
7068
0
        if (consumed) {
7069
0
            *consumed = startinpos;
7070
0
            break;
7071
0
        }
7072
0
      error:;
7073
0
        Py_ssize_t endinpos = s-starts;
7074
0
        writer.min_length = end - s + writer.pos;
7075
0
        if (unicode_decode_call_errorhandler_writer(
7076
0
                errors, &errorHandler,
7077
0
                "rawunicodeescape", message,
7078
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
7079
0
                &writer)) {
7080
0
            goto onError;
7081
0
        }
7082
0
        assert(end - s <= writer.size - writer.pos);
7083
7084
0
#undef WRITE_CHAR
7085
0
    }
7086
0
    Py_XDECREF(errorHandler);
7087
0
    Py_XDECREF(exc);
7088
0
    return _PyUnicodeWriter_Finish(&writer);
7089
7090
0
  onError:
7091
0
    _PyUnicodeWriter_Dealloc(&writer);
7092
0
    Py_XDECREF(errorHandler);
7093
0
    Py_XDECREF(exc);
7094
0
    return NULL;
7095
0
}
7096
7097
PyObject *
7098
PyUnicode_DecodeRawUnicodeEscape(const char *s,
7099
                                 Py_ssize_t size,
7100
                                 const char *errors)
7101
0
{
7102
0
    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
7103
0
}
7104
7105
7106
/* Return a Raw-Unicode-Escape string version of the Unicode object.
 *
 * Code points below U+0100 are copied as single bytes, U+0100..U+FFFF
 * become '\uHHHH' and everything above becomes '\U00HHHHHH'.
 *
 * Returns a bytes object, or NULL with an exception set if `unicode` is not
 * a str object or the output buffer cannot be allocated.
 */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    const int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);

    if (length == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    /* One-byte storage holds only code points below U+0100, which are all
       copied verbatim: the character data already is the result. */
    if (kind == PyUnicode_1BYTE_KIND) {
        return PyBytes_FromStringAndSize(data, length);
    }

    /* Worst case per character: 6 bytes ('\uHHHH') for 2-byte storage,
       10 bytes ('\U00HHHHHH') for 4-byte storage. */
    const Py_ssize_t bytes_per_char = kind * 2 + 2;
    if (length > PY_SSIZE_T_MAX / bytes_per_char) {
        return PyErr_NoMemory();
    }

    PyBytesWriter *writer = PyBytesWriter_Create(bytes_per_char * length);
    if (writer == NULL) {
        return NULL;
    }
    char *out = PyBytesWriter_GetData(writer);

    Py_ssize_t index = 0;
    while (index < length) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, index);
        index++;

        if (ch >= 0x10000) {
            /* U+010000-U+10ffff: '\U00HHHHHH' */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (int shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else if (ch >= 0x100) {
            /* U+0100-U+ffff: '\uHHHH' */
            *out++ = '\\';
            *out++ = 'u';
            for (int shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else {
            /* U+0000-U+00ff: copied as a single byte */
            *out++ = (char) ch;
        }
    }

    return PyBytesWriter_FinishWithPointer(writer, out);
}
7170
7171
/* --- Latin-1 Codec ------------------------------------------------------ */
7172
7173
PyObject *
7174
PyUnicode_DecodeLatin1(const char *s,
7175
                       Py_ssize_t size,
7176
                       const char *errors)
7177
3.16M
{
7178
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
7179
3.16M
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
7180
3.16M
}
7181
7182
/* create or adjust a UnicodeEncodeError */
7183
static void
7184
make_encode_exception(PyObject **exceptionObject,
7185
                      const char *encoding,
7186
                      PyObject *unicode,
7187
                      Py_ssize_t startpos, Py_ssize_t endpos,
7188
                      const char *reason)
7189
202k
{
7190
202k
    if (*exceptionObject == NULL) {
7191
202k
        *exceptionObject = PyObject_CallFunction(
7192
202k
            PyExc_UnicodeEncodeError, "sOnns",
7193
202k
            encoding, unicode, startpos, endpos, reason);
7194
202k
    }
7195
0
    else {
7196
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7197
0
            goto onError;
7198
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7199
0
            goto onError;
7200
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7201
0
            goto onError;
7202
0
        return;
7203
0
      onError:
7204
0
        Py_CLEAR(*exceptionObject);
7205
0
    }
7206
202k
}
7207
7208
/* raises a UnicodeEncodeError */
7209
static void
7210
raise_encode_exception(PyObject **exceptionObject,
7211
                       const char *encoding,
7212
                       PyObject *unicode,
7213
                       Py_ssize_t startpos, Py_ssize_t endpos,
7214
                       const char *reason)
7215
39.1k
{
7216
39.1k
    make_encode_exception(exceptionObject,
7217
39.1k
                          encoding, unicode, startpos, endpos, reason);
7218
39.1k
    if (*exceptionObject != NULL)
7219
39.1k
        PyCodec_StrictErrors(*exceptionObject);
7220
39.1k
}
7221
7222
/* error handling callback helper:
7223
   build arguments, call the callback and check the arguments,
7224
   put the result into newpos and return the replacement string, which
7225
   has to be freed by the caller */
7226
static PyObject *
7227
unicode_encode_call_errorhandler(const char *errors,
7228
                                 PyObject **errorHandler,
7229
                                 const char *encoding, const char *reason,
7230
                                 PyObject *unicode, PyObject **exceptionObject,
7231
                                 Py_ssize_t startpos, Py_ssize_t endpos,
7232
                                 Py_ssize_t *newpos)
7233
163k
{
7234
163k
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
7235
163k
    Py_ssize_t len;
7236
163k
    PyObject *restuple;
7237
163k
    PyObject *resunicode;
7238
7239
163k
    if (*errorHandler == NULL) {
7240
163k
        *errorHandler = PyCodec_LookupError(errors);
7241
163k
        if (*errorHandler == NULL)
7242
0
            return NULL;
7243
163k
    }
7244
7245
163k
    len = PyUnicode_GET_LENGTH(unicode);
7246
7247
163k
    make_encode_exception(exceptionObject,
7248
163k
                          encoding, unicode, startpos, endpos, reason);
7249
163k
    if (*exceptionObject == NULL)
7250
0
        return NULL;
7251
7252
163k
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
7253
163k
    if (restuple == NULL)
7254
163k
        return NULL;
7255
0
    if (!PyTuple_Check(restuple)) {
7256
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7257
0
        Py_DECREF(restuple);
7258
0
        return NULL;
7259
0
    }
7260
0
    if (!PyArg_ParseTuple(restuple, argparse,
7261
0
                          &resunicode, newpos)) {
7262
0
        Py_DECREF(restuple);
7263
0
        return NULL;
7264
0
    }
7265
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7266
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
7267
0
        Py_DECREF(restuple);
7268
0
        return NULL;
7269
0
    }
7270
0
    if (*newpos<0)
7271
0
        *newpos = len + *newpos;
7272
0
    if (*newpos<0 || *newpos>len) {
7273
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7274
0
        Py_DECREF(restuple);
7275
0
        return NULL;
7276
0
    }
7277
0
    Py_INCREF(resunicode);
7278
0
    Py_DECREF(restuple);
7279
0
    return resunicode;
7280
0
}
7281
7282
static PyObject *
7283
unicode_encode_ucs1(PyObject *unicode,
7284
                    const char *errors,
7285
                    const Py_UCS4 limit)
7286
49.4k
{
7287
    /* input state */
7288
49.4k
    Py_ssize_t pos=0, size;
7289
49.4k
    int kind;
7290
49.4k
    const void *data;
7291
49.4k
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7292
49.4k
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
7293
49.4k
    PyObject *error_handler_obj = NULL;
7294
49.4k
    PyObject *exc = NULL;
7295
49.4k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7296
49.4k
    PyObject *rep = NULL;
7297
7298
49.4k
    size = PyUnicode_GET_LENGTH(unicode);
7299
49.4k
    kind = PyUnicode_KIND(unicode);
7300
49.4k
    data = PyUnicode_DATA(unicode);
7301
    /* allocate enough for a simple encoding without
7302
       replacements, if we need more, we'll resize */
7303
49.4k
    if (size == 0)
7304
0
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
7305
7306
    /* output object */
7307
49.4k
    PyBytesWriter *writer = PyBytesWriter_Create(size);
7308
49.4k
    if (writer == NULL) {
7309
0
        return NULL;
7310
0
    }
7311
    /* pointer into the output */
7312
49.4k
    char *str = PyBytesWriter_GetData(writer);
7313
7314
3.48M
    while (pos < size) {
7315
3.48M
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
7316
7317
        /* can we encode this? */
7318
3.48M
        if (ch < limit) {
7319
            /* no overflow check, because we know that the space is enough */
7320
3.43M
            *str++ = (char)ch;
7321
3.43M
            ++pos;
7322
3.43M
        }
7323
49.4k
        else {
7324
49.4k
            Py_ssize_t newpos, i;
7325
            /* startpos for collecting unencodable chars */
7326
49.4k
            Py_ssize_t collstart = pos;
7327
49.4k
            Py_ssize_t collend = collstart + 1;
7328
            /* find all unecodable characters */
7329
7330
393k
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
7331
343k
                ++collend;
7332
7333
            /* Only overallocate the buffer if it's not the last write */
7334
49.4k
            writer->overallocate = (collend < size);
7335
7336
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
7337
49.4k
            if (error_handler == _Py_ERROR_UNKNOWN)
7338
49.4k
                error_handler = _Py_GetErrorHandler(errors);
7339
7340
49.4k
            switch (error_handler) {
7341
39.1k
            case _Py_ERROR_STRICT:
7342
39.1k
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
7343
39.1k
                goto onError;
7344
7345
0
            case _Py_ERROR_REPLACE:
7346
0
                memset(str, '?', collend - collstart);
7347
0
                str += (collend - collstart);
7348
0
                _Py_FALLTHROUGH;
7349
0
            case _Py_ERROR_IGNORE:
7350
0
                pos = collend;
7351
0
                break;
7352
7353
0
            case _Py_ERROR_BACKSLASHREPLACE:
7354
                /* subtract preallocated bytes */
7355
0
                writer->size -= (collend - collstart);
7356
0
                str = backslashreplace(writer, str,
7357
0
                                       unicode, collstart, collend);
7358
0
                if (str == NULL)
7359
0
                    goto onError;
7360
0
                pos = collend;
7361
0
                break;
7362
7363
0
            case _Py_ERROR_XMLCHARREFREPLACE:
7364
                /* subtract preallocated bytes */
7365
0
                writer->size -= (collend - collstart);
7366
0
                str = xmlcharrefreplace(writer, str,
7367
0
                                        unicode, collstart, collend);
7368
0
                if (str == NULL)
7369
0
                    goto onError;
7370
0
                pos = collend;
7371
0
                break;
7372
7373
10.3k
            case _Py_ERROR_SURROGATEESCAPE:
7374
10.3k
                for (i = collstart; i < collend; ++i) {
7375
10.3k
                    ch = PyUnicode_READ(kind, data, i);
7376
10.3k
                    if (ch < 0xdc80 || 0xdcff < ch) {
7377
                        /* Not a UTF-8b surrogate */
7378
10.3k
                        break;
7379
10.3k
                    }
7380
0
                    *str++ = (char)(ch - 0xdc00);
7381
0
                    ++pos;
7382
0
                }
7383
10.3k
                if (i >= collend)
7384
0
                    break;
7385
10.3k
                collstart = pos;
7386
10.3k
                assert(collstart != collend);
7387
10.3k
                _Py_FALLTHROUGH;
7388
7389
10.3k
            default:
7390
10.3k
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7391
10.3k
                                                       encoding, reason, unicode, &exc,
7392
10.3k
                                                       collstart, collend, &newpos);
7393
10.3k
                if (rep == NULL)
7394
10.3k
                    goto onError;
7395
7396
0
                if (newpos < collstart) {
7397
0
                    writer->overallocate = 1;
7398
0
                    str = PyBytesWriter_GrowAndUpdatePointer(writer,
7399
0
                                                             collstart - newpos,
7400
0
                                                             str);
7401
0
                    if (str == NULL) {
7402
0
                        goto onError;
7403
0
                    }
7404
0
                }
7405
0
                else {
7406
                    /* subtract preallocated bytes */
7407
0
                    writer->size -= newpos - collstart;
7408
                    /* Only overallocate the buffer if it's not the last write */
7409
0
                    writer->overallocate = (newpos < size);
7410
0
                }
7411
7412
0
                char *rep_str;
7413
0
                Py_ssize_t rep_len;
7414
0
                if (PyBytes_Check(rep)) {
7415
                    /* Directly copy bytes result to output. */
7416
0
                    rep_str = PyBytes_AS_STRING(rep);
7417
0
                    rep_len = PyBytes_GET_SIZE(rep);
7418
0
                }
7419
0
                else {
7420
0
                    assert(PyUnicode_Check(rep));
7421
7422
0
                    if (limit == 256 ?
7423
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7424
0
                        !PyUnicode_IS_ASCII(rep))
7425
0
                    {
7426
                        /* Not all characters are smaller than limit */
7427
0
                        raise_encode_exception(&exc, encoding, unicode,
7428
0
                                               collstart, collend, reason);
7429
0
                        goto onError;
7430
0
                    }
7431
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7432
0
                    rep_str = PyUnicode_DATA(rep);
7433
0
                    rep_len = PyUnicode_GET_LENGTH(rep);
7434
0
                }
7435
7436
0
                str = PyBytesWriter_GrowAndUpdatePointer(writer, rep_len, str);
7437
0
                if (str == NULL) {
7438
0
                    goto onError;
7439
0
                }
7440
0
                memcpy(str, rep_str, rep_len);
7441
0
                str += rep_len;
7442
7443
0
                pos = newpos;
7444
0
                Py_CLEAR(rep);
7445
49.4k
            }
7446
7447
            /* If overallocation was disabled, ensure that it was the last
7448
               write. Otherwise, we missed an optimization */
7449
49.4k
            assert(writer->overallocate || pos == size);
7450
0
        }
7451
3.48M
    }
7452
7453
0
    Py_XDECREF(error_handler_obj);
7454
0
    Py_XDECREF(exc);
7455
0
    return PyBytesWriter_FinishWithPointer(writer, str);
7456
7457
49.4k
  onError:
7458
49.4k
    Py_XDECREF(rep);
7459
49.4k
    PyBytesWriter_Discard(writer);
7460
49.4k
    Py_XDECREF(error_handler_obj);
7461
49.4k
    Py_XDECREF(exc);
7462
49.4k
    return NULL;
7463
49.4k
}
7464
7465
/* Encode `unicode` as Latin-1 using the error handler named by `errors`.
 *
 * Returns a bytes object, or NULL with an exception set (including when
 * `unicode` is not a str object).
 */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters are present: the general encoder deals
           with them through the error handler. */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's storage already is its Latin-1
       encoding, so the bytes object is built from it directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7481
7482
/* Encode `unicode` as Latin-1, passing NULL as the error handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, default_errors);
}
7487
7488
/* --- 7-bit ASCII Codec -------------------------------------------------- */
7489
7490
PyObject *
7491
PyUnicode_DecodeASCII(const char *s,
7492
                      Py_ssize_t size,
7493
                      const char *errors)
7494
845k
{
7495
845k
    const char *starts = s;
7496
845k
    const char *e = s + size;
7497
845k
    PyObject *error_handler_obj = NULL;
7498
845k
    PyObject *exc = NULL;
7499
845k
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
7500
7501
845k
    if (size == 0)
7502
0
        _Py_RETURN_UNICODE_EMPTY();
7503
7504
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7505
845k
    if (size == 1 && (unsigned char)s[0] < 128) {
7506
7.42k
        return get_latin1_char((unsigned char)s[0]);
7507
7.42k
    }
7508
7509
    // Shortcut for simple case
7510
837k
    PyObject *u = PyUnicode_New(size, 127);
7511
837k
    if (u == NULL) {
7512
0
        return NULL;
7513
0
    }
7514
837k
    Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
7515
837k
    if (outpos == size) {
7516
691k
        return u;
7517
691k
    }
7518
7519
146k
    _PyUnicodeWriter writer;
7520
146k
    _PyUnicodeWriter_InitWithBuffer(&writer, u);
7521
146k
    writer.pos = outpos;
7522
7523
146k
    s += outpos;
7524
146k
    int kind = writer.kind;
7525
146k
    void *data = writer.data;
7526
146k
    Py_ssize_t startinpos, endinpos;
7527
7528
19.7M
    while (s < e) {
7529
19.6M
        unsigned char c = (unsigned char)*s;
7530
19.6M
        if (c < 128) {
7531
6.12M
            PyUnicode_WRITE(kind, data, writer.pos, c);
7532
6.12M
            writer.pos++;
7533
6.12M
            ++s;
7534
6.12M
            continue;
7535
6.12M
        }
7536
7537
        /* byte outsize range 0x00..0x7f: call the error handler */
7538
7539
13.5M
        if (error_handler == _Py_ERROR_UNKNOWN)
7540
146k
            error_handler = _Py_GetErrorHandler(errors);
7541
7542
13.5M
        switch (error_handler)
7543
13.5M
        {
7544
659k
        case _Py_ERROR_REPLACE:
7545
13.5M
        case _Py_ERROR_SURROGATEESCAPE:
7546
            /* Fast-path: the error handler only writes one character,
7547
               but we may switch to UCS2 at the first write */
7548
13.5M
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7549
0
                goto onError;
7550
13.5M
            kind = writer.kind;
7551
13.5M
            data = writer.data;
7552
7553
13.5M
            if (error_handler == _Py_ERROR_REPLACE)
7554
659k
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7555
12.8M
            else
7556
12.8M
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7557
13.5M
            writer.pos++;
7558
13.5M
            ++s;
7559
13.5M
            break;
7560
7561
0
        case _Py_ERROR_IGNORE:
7562
0
            ++s;
7563
0
            break;
7564
7565
7.80k
        default:
7566
7.80k
            startinpos = s-starts;
7567
7.80k
            endinpos = startinpos + 1;
7568
7.80k
            if (unicode_decode_call_errorhandler_writer(
7569
7.80k
                    errors, &error_handler_obj,
7570
7.80k
                    "ascii", "ordinal not in range(128)",
7571
7.80k
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7572
7.80k
                    &writer))
7573
7.80k
                goto onError;
7574
0
            kind = writer.kind;
7575
0
            data = writer.data;
7576
13.5M
        }
7577
13.5M
    }
7578
138k
    Py_XDECREF(error_handler_obj);
7579
138k
    Py_XDECREF(exc);
7580
138k
    return _PyUnicodeWriter_Finish(&writer);
7581
7582
7.80k
  onError:
7583
7.80k
    _PyUnicodeWriter_Dealloc(&writer);
7584
7.80k
    Py_XDECREF(error_handler_obj);
7585
7.80k
    Py_XDECREF(exc);
7586
7.80k
    return NULL;
7587
146k
}
7588
7589
/* Encode `unicode` as ASCII using the error handler named by `errors`.
 *
 * Returns a bytes object, or NULL with an exception set (including when
 * `unicode` is not a str object).
 */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters are present: the general encoder deals
           with them through the error handler. */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's storage already is its encoding,
       so the bytes object is built from it directly. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7603
7604
/* Encode `unicode` as ASCII, passing NULL as the error handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *default_errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, default_errors);
}
7609
7610
#ifdef MS_WINDOWS
7611
7612
/* --- MBCS codecs for Windows -------------------------------------------- */
7613
7614
#if SIZEOF_INT < SIZEOF_SIZE_T
7615
#define NEED_RETRY
7616
#endif
7617
7618
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7619
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7620
   both cases also and avoids partial characters overrunning the
7621
   length limit in MultiByteToWideChar on Windows */
7622
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7623
7624
#ifndef WC_ERR_INVALID_CHARS
7625
#  define WC_ERR_INVALID_CHARS 0x0080
7626
#endif
7627
7628
static const char*
7629
code_page_name(UINT code_page, PyObject **obj)
7630
{
7631
    *obj = NULL;
7632
    if (code_page == CP_ACP)
7633
        return "mbcs";
7634
7635
    *obj = PyBytes_FromFormat("cp%u", code_page);
7636
    if (*obj == NULL)
7637
        return NULL;
7638
    return PyBytes_AS_STRING(*obj);
7639
}
7640
7641
static DWORD
7642
decode_code_page_flags(UINT code_page)
7643
{
7644
    if (code_page == CP_UTF7) {
7645
        /* The CP_UTF7 decoder only supports flags=0 */
7646
        return 0;
7647
    }
7648
    else
7649
        return MB_ERR_INVALID_CHARS;
7650
}
7651
7652
/*
7653
 * Decode a byte string from a Windows code page into unicode object in strict
7654
 * mode.
7655
 *
7656
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7657
 * OSError and returns -1 on other error.
7658
 */
7659
static int
7660
decode_code_page_strict(UINT code_page,
7661
                        wchar_t **buf,
7662
                        Py_ssize_t *bufsize,
7663
                        const char *in,
7664
                        int insize)
7665
{
7666
    DWORD flags = MB_ERR_INVALID_CHARS;
7667
    wchar_t *out;
7668
    DWORD outsize;
7669
7670
    /* First get the size of the result */
7671
    assert(insize > 0);
7672
    while ((outsize = MultiByteToWideChar(code_page, flags,
7673
                                          in, insize, NULL, 0)) <= 0)
7674
    {
7675
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7676
            goto error;
7677
        }
7678
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7679
        flags = 0;
7680
    }
7681
7682
    /* Extend a wchar_t* buffer */
7683
    Py_ssize_t n = *bufsize;   /* Get the current length */
7684
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7685
        return -1;
7686
    }
7687
    out = *buf + n;
7688
7689
    /* Do the conversion */
7690
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7691
    if (outsize <= 0)
7692
        goto error;
7693
    return insize;
7694
7695
error:
7696
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7697
        return -2;
7698
    PyErr_SetFromWindowsErr(0);
7699
    return -1;
7700
}
7701
7702
/*
 * Decode a byte string from a code page, one character at a time, applying
 * an error handler to undecodable input.  Decoded wide characters are
 * appended to the wchar_t buffer *buf (current length *bufsize).
 *
 * With errors NULL or "strict" and final set, no decoding is attempted: a
 * decode exception covering position 0 is built and passed to
 * PyCodec_StrictErrors(), and -1 is returned.
 *
 * Returns consumed size if succeed, or raise an OSError or
 * UnicodeDecodeError exception and returns -1 on error.
 */
static int
decode_code_page_errors(UINT code_page,
                        wchar_t **buf,
                        Py_ssize_t *bufsize,
                        const char *in, const int size,
                        const char *errors, int final)
{
    const char *startin = in;
    const char *endin = in + size;
    DWORD flags = MB_ERR_INVALID_CHARS;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "No mapping for the Unicode character exists "
                         "in the target code page.";
    /* each step cannot decode more than 1 character, but a character can be
       represented as a surrogate pair */
    wchar_t buffer[2], *out;
    int insize;
    Py_ssize_t outsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    DWORD err;
    int ret = -1;

    assert(size > 0);

    /* encoding_obj (if any) owns the storage behind `encoding`; nothing else
       has been acquired yet, so a plain return is enough on failure. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
           UnicodeDecodeError. */
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_CLEAR(exc);
        }
        goto error;
    }

    /* Extend a wchar_t* buffer */
    Py_ssize_t n = *bufsize;   /* Get the current length */
    /* Reserve room for Py_ARRAY_LENGTH(buffer) (= 2) wide chars per input
       byte; check that n + size * 2 cannot overflow first. */
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
        PyErr_NoMemory();
        goto error;
    }
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
        goto error;
    }
    out = *buf + n;

    /* Decode the byte string character per character */
    while (in < endin)
    {
        /* Decode a character */
        /* Try 1 byte first, then widen the window one byte at a time until
           the conversion succeeds or the window limit is reached. */
        insize = 1;
        do
        {
            outsize = MultiByteToWideChar(code_page, flags,
                                          in, insize,
                                          buffer, Py_ARRAY_LENGTH(buffer));
            if (outsize > 0)
                break;
            err = GetLastError();
            if (err == ERROR_INVALID_FLAGS && flags) {
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
                flags = 0;
                /* Retry with the same insize: `continue` re-evaluates the
                   loop condition without incrementing insize. */
                continue;
            }
            if (err != ERROR_NO_UNICODE_TRANSLATION
                && err != ERROR_INSUFFICIENT_BUFFER)
            {
                PyErr_SetFromWindowsErr(err);
                goto error;
            }
            insize++;
        }
        /* 4=maximum length of a UTF-8 sequence */
        while (insize <= 4 && (in + insize) <= endin);

        if (outsize <= 0) {
            Py_ssize_t startinpos, endinpos, outpos;

            /* last character in partial decode? */
            /* In non-final mode the undecodable tail is left unconsumed and
               the loop stops; only the bytes before `in` are reported. */
            if (in + insize >= endin && !final)
                break;

            /* Report a one-byte error at the current input position.  The
               handler call receives pointers to the input cursor, the
               buffer and the output position, so `out` is recomputed from
               *buf and outpos afterwards. */
            startinpos = in - startin;
            endinpos = startinpos + 1;
            outpos = out - *buf;
            if (unicode_decode_call_errorhandler_wchar(
                    errors, &errorHandler,
                    encoding, reason,
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
                    buf, bufsize, &outpos))
            {
                goto error;
            }
            out = *buf + outpos;
        }
        else {
            /* Success: consume insize bytes and append outsize wide chars. */
            in += insize;
            memcpy(out, buffer, outsize * sizeof(wchar_t));
            out += outsize;
        }
    }

    /* Shrink the buffer */
    /* Only the recorded length is lowered here; no reallocation is done. */
    assert(out - *buf <= *bufsize);
    *bufsize = out - *buf;
    /* (in - startin) <= size and size is an int */
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);

    /* Shared exit: reached on success (ret >= 0) and on failure (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7831
7832
/*
 * Decode `size` bytes at `s` from the Windows code page `code_page` and
 * return a new str object, or NULL with an exception set.
 *
 * The input is processed in chunks of at most DECODING_CHUNK_SIZE bytes
 * when NEED_RETRY is defined.  Each chunk is first decoded in strict mode;
 * if that reports a decode error (-2) the chunk is decoded again through
 * decode_code_page_errors() with the `errors` handler.
 *
 * If `consumed` is non-NULL it receives the number of bytes decoded, and
 * the last chunk is decoded with final=0; if it is NULL the last chunk is
 * decoded with final=1.
 */
static PyObject *
decode_code_page_stateful(int code_page,
                          const char *s, Py_ssize_t size,
                          const char *errors, Py_ssize_t *consumed)
{
    /* Growable wide-character output buffer, released with PyMem_Free(). */
    wchar_t *buf = NULL;
    Py_ssize_t bufsize = 0;
    int chunk_size, final, converted, done;

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }
    if (size < 0) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (consumed)
        *consumed = 0;

    do
    {
#ifdef NEED_RETRY
        /* More input than fits in one chunk: decode a full chunk as a
           non-final piece and loop again. */
        if (size > DECODING_CHUNK_SIZE) {
            chunk_size = DECODING_CHUNK_SIZE;
            final = 0;
            done = 0;
        }
        else
#endif
        {
            /* Last (or only) chunk. */
            chunk_size = (int)size;
            final = (consumed == NULL);
            done = 1;
        }

        /* Nothing left to decode: finish with what has been accumulated,
           or return the empty string if nothing was ever decoded. */
        if (chunk_size == 0 && done) {
            if (buf != NULL)
                break;
            _Py_RETURN_UNICODE_EMPTY();
        }

        converted = decode_code_page_strict(code_page, &buf, &bufsize,
                                            s, chunk_size);
        /* -2 means "undecodable input": retry with the error handler. */
        if (converted == -2)
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
                                                s, chunk_size,
                                                errors, final);
        assert(converted != 0 || done);

        if (converted < 0) {
            PyMem_Free(buf);
            return NULL;
        }

        if (consumed)
            *consumed += converted;

        /* Advance by the bytes actually consumed, which may be fewer than
           chunk_size. */
        s += converted;
        size -= converted;
    } while (!done);

    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
    PyMem_Free(buf);
    return v;
}
7899
7900
PyObject *
7901
PyUnicode_DecodeCodePageStateful(int code_page,
7902
                                 const char *s,
7903
                                 Py_ssize_t size,
7904
                                 const char *errors,
7905
                                 Py_ssize_t *consumed)
7906
{
7907
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7908
}
7909
7910
PyObject *
7911
PyUnicode_DecodeMBCSStateful(const char *s,
7912
                             Py_ssize_t size,
7913
                             const char *errors,
7914
                             Py_ssize_t *consumed)
7915
{
7916
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7917
}
7918
7919
PyObject *
7920
PyUnicode_DecodeMBCS(const char *s,
7921
                     Py_ssize_t size,
7922
                     const char *errors)
7923
{
7924
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7925
}
7926
7927
static DWORD
7928
encode_code_page_flags(UINT code_page, const char *errors)
7929
{
7930
    if (code_page == CP_UTF8) {
7931
        return WC_ERR_INVALID_CHARS;
7932
    }
7933
    else if (code_page == CP_UTF7) {
7934
        /* CP_UTF7 only supports flags=0 */
7935
        return 0;
7936
    }
7937
    else {
7938
        if (errors != NULL && strcmp(errors, "replace") == 0)
7939
            return 0;
7940
        else
7941
            return WC_NO_BEST_FIT_CHARS;
7942
    }
7943
}
7944
7945
/*
7946
 * Encode a Unicode string to a Windows code page into a byte string in strict
7947
 * mode.
7948
 *
7949
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7950
 * an OSError and returns -1 on other error.
7951
 */
7952
static int
7953
encode_code_page_strict(UINT code_page, PyBytesWriter **writer,
7954
                        PyObject *unicode, Py_ssize_t offset, int len,
7955
                        const char* errors)
7956
{
7957
    BOOL usedDefaultChar = FALSE;
7958
    BOOL *pusedDefaultChar = &usedDefaultChar;
7959
    int outsize;
7960
    wchar_t *p;
7961
    Py_ssize_t size;
7962
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7963
    char *out;
7964
    /* Create a substring so that we can get the UTF-16 representation
7965
       of just the slice under consideration. */
7966
    PyObject *substring;
7967
    int ret = -1;
7968
7969
    assert(len > 0);
7970
7971
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7972
        pusedDefaultChar = &usedDefaultChar;
7973
    else
7974
        pusedDefaultChar = NULL;
7975
7976
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7977
    if (substring == NULL)
7978
        return -1;
7979
    p = PyUnicode_AsWideCharString(substring, &size);
7980
    Py_CLEAR(substring);
7981
    if (p == NULL) {
7982
        return -1;
7983
    }
7984
    assert(size <= INT_MAX);
7985
7986
    /* First get the size of the result */
7987
    outsize = WideCharToMultiByte(code_page, flags,
7988
                                  p, (int)size,
7989
                                  NULL, 0,
7990
                                  NULL, pusedDefaultChar);
7991
    if (outsize <= 0)
7992
        goto error;
7993
    /* If we used a default char, then we failed! */
7994
    if (pusedDefaultChar && *pusedDefaultChar) {
7995
        ret = -2;
7996
        goto done;
7997
    }
7998
7999
    if (*writer == NULL) {
8000
        /* Create string object */
8001
        *writer = PyBytesWriter_Create(outsize);
8002
        if (*writer == NULL) {
8003
            goto done;
8004
        }
8005
        out = PyBytesWriter_GetData(*writer);
8006
    }
8007
    else {
8008
        /* Extend string object */
8009
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
8010
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
8011
            goto done;
8012
        }
8013
        out = (char*)PyBytesWriter_GetData(*writer) + n;
8014
    }
8015
8016
    /* Do the conversion */
8017
    outsize = WideCharToMultiByte(code_page, flags,
8018
                                  p, (int)size,
8019
                                  out, outsize,
8020
                                  NULL, pusedDefaultChar);
8021
    if (outsize <= 0)
8022
        goto error;
8023
    if (pusedDefaultChar && *pusedDefaultChar) {
8024
        ret = -2;
8025
        goto done;
8026
    }
8027
    ret = 0;
8028
8029
done:
8030
    PyMem_Free(p);
8031
    return ret;
8032
8033
error:
8034
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
8035
        ret = -2;
8036
        goto done;
8037
    }
8038
    PyErr_SetFromWindowsErr(0);
8039
    goto done;
8040
}
8041
8042
/*
 * Encode the characters [unicode_offset, unicode_offset+insize) of a
 * Unicode string to a Windows code page, one character at a time, calling
 * the `errors` error handler for characters that cannot be encoded.  The
 * bytes are appended to *writer (created here when it is still NULL).
 *
 * With errors NULL or "strict", nothing is encoded: an encode exception is
 * built and passed to PyCodec_StrictErrors(), and -1 is returned.
 *
 * Returns 0 on success, or sets an exception and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyBytesWriter **writer,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Fixed reason text used for every encode error reported here. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    const char *encoding;
    Py_ssize_t newpos;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    /* encoding_obj (if any) owns the storage behind `encoding`. */
    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* pusedDefaultChar stays NULL for CP_UTF8 / CP_UTF7, so the "default
       character used" check below is skipped for those code pages. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve Py_ARRAY_LENGTH(buffer) (= 4) output bytes per input
       character, after checking that the product cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*writer == NULL) {
        /* Create string object */
        *writer = PyBytesWriter_Create(outsize);
        if (*writer == NULL) {
            goto error;
        }
        out = PyBytesWriter_GetData(*writer);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytesWriter_GetSize(*writer);
        if (PyBytesWriter_Grow(*writer, outsize) < 0) {
            goto error;
        }
        out = (char*)PyBytesWriter_GetData(*writer) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Build the UTF-16 form of the character: one unit below 0x10000,
           otherwise a high/low surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Accept the result only if no default (substitute) character
               was used; otherwise fall through to the error handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character at `pos` could not be encoded: ask the error
           handler for a replacement and the position to resume at. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;

        /* morebytes = replacement length - (newpos - pos); the writer is
           grown only when this is positive.  Presumably this accounts for
           replacements longer than the input span they stand for. */
        Py_ssize_t morebytes = pos - newpos;
        if (PyBytes_Check(rep)) {
            /* bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
                if (out == NULL) {
                    Py_DECREF(rep);
                    goto error;
                }
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Non-bytes replacement, read through the str accessors: every
               character must be <= 127 and is written as a single byte. */
            Py_ssize_t i;
            int kind;
            const void *data;

            outsize = PyUnicode_GET_LENGTH(rep);
            morebytes += outsize;
            if (morebytes > 0) {
                out = PyBytesWriter_GrowAndUpdatePointer(*writer, morebytes, out);
                if (out == NULL) {
                    Py_DECREF(rep);
                    goto error;
                }
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        pos = newpos;
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the writer to the number of bytes actually produced. */
    outsize = out - (char*)PyBytesWriter_GetData(*writer);
    assert(outsize <= PyBytesWriter_GetSize(*writer));
    if (PyBytesWriter_Resize(*writer, outsize) < 0) {
        goto error;
    }
    ret = 0;

    /* Shared exit: reached on success (ret == 0) and on failure (ret == -1). */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
8222
8223
8224
/* Encode a str object to the Windows code page `code_page` and return a new
 * bytes object, or NULL with an exception set.
 *
 * The string is processed in chunks of at most DECODING_CHUNK_SIZE
 * characters when NEED_RETRY is defined.  Each chunk is first encoded in
 * strict mode; a chunk that reports an encode error (-2) is encoded again
 * through encode_code_page_errors() with the `errors` handler. */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyBytesWriter *writer = NULL;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    Py_ssize_t remaining = PyUnicode_GET_LENGTH(unicode);
    if (remaining == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }

    Py_ssize_t start = 0;
    for (;;) {
        int chunk_len;
        int last_chunk = 1;

#ifdef NEED_RETRY
        if (remaining > DECODING_CHUNK_SIZE) {
            chunk_len = DECODING_CHUNK_SIZE;
            last_chunk = 0;
        }
        else
#endif
        {
            chunk_len = (int)remaining;
        }

        int status = encode_code_page_strict(code_page, &writer,
                                             unicode, start, chunk_len,
                                             errors);
        if (status == -2) {
            /* Strict encoding hit an unencodable character: redo this
               chunk with the error handler. */
            status = encode_code_page_errors(code_page, &writer,
                                             unicode, start,
                                             chunk_len, errors);
        }
        if (status < 0) {
            PyBytesWriter_Discard(writer);
            return NULL;
        }

        if (last_chunk) {
            break;
        }
        start += chunk_len;
        remaining -= chunk_len;
    }

    return PyBytesWriter_Finish(writer);
}
8282
8283
8284
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    /* "mbcs" encoding: code page CP_ACP with errors passed as NULL. */
    PyObject *encoded = PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
    return encoded;
}
8289
8290
#undef NEED_RETRY
8291
8292
#endif /* MS_WINDOWS */
8293
8294
/* --- Character Mapping Codec -------------------------------------------- */
8295
8296
/* Decode `size` bytes at `s` through a str `mapping` (byte value -> the
 * character at that index of the mapping) and append the result to `writer`.
 *
 * A byte whose value is >= len(mapping), or whose mapped character is
 * U+FFFE, is treated as undefined and routed through the `errors` handler.
 *
 * Returns 0 on success, -1 with an exception set on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    int mapkind;
    const void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* Raise the writer's maxchar to 0xff, then refresh the
                   cached maxchar and data pointer from the writer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        /* errorHandler and exc are still NULL on this path. */
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a 2-byte mapping: write directly into
               the writer's buffer for as long as the next character fits,
               and jump to Error (with x already set) when it does not. */
            int outkind = writer->kind;
            const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* Generic path: bounds-checked lookup of a single byte. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
        /* Also the landing point of the fast inner loops above; `x` holds
           the mapped character for the byte at `s`. */
Error:
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* The handler call was given &s, so `s` is not advanced here. */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8411
8412
static int
8413
charmap_decode_mapping(const char *s,
8414
                       Py_ssize_t size,
8415
                       PyObject *mapping,
8416
                       const char *errors,
8417
                       _PyUnicodeWriter *writer)
8418
0
{
8419
0
    const char *starts = s;
8420
0
    const char *e;
8421
0
    Py_ssize_t startinpos, endinpos;
8422
0
    PyObject *errorHandler = NULL, *exc = NULL;
8423
0
    unsigned char ch;
8424
0
    PyObject *key, *item = NULL;
8425
8426
0
    e = s + size;
8427
8428
0
    while (s < e) {
8429
0
        ch = *s;
8430
8431
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
8432
0
        key = PyLong_FromLong((long)ch);
8433
0
        if (key == NULL)
8434
0
            goto onError;
8435
8436
0
        int rc = PyMapping_GetOptionalItem(mapping, key, &item);
8437
0
        Py_DECREF(key);
8438
0
        if (rc == 0) {
8439
            /* No mapping found means: mapping is undefined. */
8440
0
            goto Undefined;
8441
0
        }
8442
0
        if (item == NULL) {
8443
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8444
                /* No mapping found means: mapping is undefined. */
8445
0
                PyErr_Clear();
8446
0
                goto Undefined;
8447
0
            } else
8448
0
                goto onError;
8449
0
        }
8450
8451
        /* Apply mapping */
8452
0
        if (item == Py_None)
8453
0
            goto Undefined;
8454
0
        if (PyLong_Check(item)) {
8455
0
            long value = PyLong_AsLong(item);
8456
0
            if (value == 0xFFFE)
8457
0
                goto Undefined;
8458
0
            if (value < 0 || value > MAX_UNICODE) {
8459
0
                PyErr_Format(PyExc_TypeError,
8460
0
                             "character mapping must be in range(0x%x)",
8461
0
                             (unsigned long)MAX_UNICODE + 1);
8462
0
                goto onError;
8463
0
            }
8464
8465
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8466
0
                goto onError;
8467
0
        }
8468
0
        else if (PyUnicode_Check(item)) {
8469
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8470
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8471
0
                if (value == 0xFFFE)
8472
0
                    goto Undefined;
8473
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8474
0
                    goto onError;
8475
0
            }
8476
0
            else {
8477
0
                writer->overallocate = 1;
8478
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8479
0
                    goto onError;
8480
0
            }
8481
0
        }
8482
0
        else {
8483
            /* wrong return value */
8484
0
            PyErr_SetString(PyExc_TypeError,
8485
0
                            "character mapping must return integer, None or str");
8486
0
            goto onError;
8487
0
        }
8488
0
        Py_CLEAR(item);
8489
0
        ++s;
8490
0
        continue;
8491
8492
0
Undefined:
8493
        /* undefined mapping */
8494
0
        Py_CLEAR(item);
8495
0
        startinpos = s-starts;
8496
0
        endinpos = startinpos+1;
8497
0
        if (unicode_decode_call_errorhandler_writer(
8498
0
                errors, &errorHandler,
8499
0
                "charmap", "character maps to <undefined>",
8500
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8501
0
                writer)) {
8502
0
            goto onError;
8503
0
        }
8504
0
    }
8505
0
    Py_XDECREF(errorHandler);
8506
0
    Py_XDECREF(exc);
8507
0
    return 0;
8508
8509
0
onError:
8510
0
    Py_XDECREF(item);
8511
0
    Py_XDECREF(errorHandler);
8512
0
    Py_XDECREF(exc);
8513
0
    return -1;
8514
0
}
8515
8516
PyObject *
8517
PyUnicode_DecodeCharmap(const char *s,
8518
                        Py_ssize_t size,
8519
                        PyObject *mapping,
8520
                        const char *errors)
8521
11.0k
{
8522
11.0k
    _PyUnicodeWriter writer;
8523
8524
    /* Default to Latin-1 */
8525
11.0k
    if (mapping == NULL)
8526
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8527
8528
11.0k
    if (size == 0)
8529
0
        _Py_RETURN_UNICODE_EMPTY();
8530
11.0k
    _PyUnicodeWriter_Init(&writer);
8531
11.0k
    writer.min_length = size;
8532
11.0k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8533
0
        goto onError;
8534
8535
11.0k
    if (PyUnicode_CheckExact(mapping)) {
8536
11.0k
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8537
21
            goto onError;
8538
11.0k
    }
8539
0
    else {
8540
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8541
0
            goto onError;
8542
0
    }
8543
11.0k
    return _PyUnicodeWriter_Finish(&writer);
8544
8545
21
  onError:
8546
21
    _PyUnicodeWriter_Dealloc(&writer);
8547
21
    return NULL;
8548
11.0k
}
8549
8550
/* Charmap encoding: the lookup table */
8551
8552
/*[clinic input]
8553
class EncodingMap "struct encoding_map *" "&EncodingMapType"
8554
[clinic start generated code]*/
8555
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=14e46bbb6c522d22]*/
8556
8557
struct encoding_map {
8558
    PyObject_HEAD
8559
    unsigned char level1[32];
8560
    int count2, count3;
8561
    unsigned char level23[1];
8562
};
8563
8564
/*[clinic input]
8565
EncodingMap.size
8566
8567
Return the size (in bytes) of this object.
8568
[clinic start generated code]*/
8569
8570
static PyObject *
8571
EncodingMap_size_impl(struct encoding_map *self)
8572
/*[clinic end generated code: output=c4c969e4c99342a4 input=004ff13f26bb5366]*/
8573
0
{
8574
0
    return PyLong_FromLong((sizeof(*self) - 1) + 16*self->count2 +
8575
0
                           128*self->count3);
8576
0
}
8577
8578
static PyMethodDef encoding_map_methods[] = {
8579
    ENCODINGMAP_SIZE_METHODDEF
8580
    {NULL, NULL}
8581
};
8582
8583
static PyTypeObject EncodingMapType = {
8584
    PyVarObject_HEAD_INIT(NULL, 0)
8585
    .tp_name = "EncodingMap",
8586
    .tp_basicsize = sizeof(struct encoding_map),
8587
    /* methods */
8588
    .tp_flags = Py_TPFLAGS_DEFAULT,
8589
    .tp_methods = encoding_map_methods,
8590
};
8591
8592
/* Build the encoder-side lookup object for a decoding table given as a str.

   Only the first 256 characters of `string` are looked at; the character
   at index i stands for byte value i.  The result is an EncodingMap trie
   when the table allows it, otherwise a dict mapping each code point to
   its index.  Returns NULL on failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    unsigned char level1[32];
    unsigned char level2[512];
    int count2 = 0;
    int count3 = 0;
    int need_dict = 0;
    int i;

    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }

    int kind = PyUnicode_KIND(string);
    const void *data = PyUnicode_DATA(string);
    int length = (int)Py_MIN(PyUnicode_GET_LENGTH(string), 256);

    /* 0xFF marks a table slot that has no block assigned yet. */
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* First pass: count the level-2 and level-3 blocks the trie would need.
       If there isn't a one-to-one mapping of NULL to \0, or if there are
       non-BMP characters, we need to use a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0) {
        need_dict = 1;
    }
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        int top = ch >> 11;
        int mid = ch >> 7;
        if (level1[top] == 0xFF) {
            level1[top] = count2++;
        }
        if (level2[mid] == 0xFF) {
            level2[mid] = count3++;
        }
    }

    /* Block numbers are stored in a byte and 0xFF is reserved. */
    if (count2 >= 0xFF || count3 >= 0xFF) {
        need_dict = 1;
    }

    if (need_dict) {
        PyObject *dict = PyDict_New();
        if (dict == NULL) {
            return NULL;
        }
        for (i = 0; i < length; i++) {
            Py_UCS4 c = PyUnicode_READ(kind, data, i);
            PyObject *key = PyLong_FromLong(c);
            if (key == NULL) {
                Py_DECREF(dict);
                return NULL;
            }
            PyObject *value = PyLong_FromLong(i);
            if (value == NULL) {
                Py_DECREF(key);
                Py_DECREF(dict);
                return NULL;
            }
            int rc = PyDict_SetItem(dict, key, value);
            Py_DECREF(key);
            Py_DECREF(value);
            if (rc < 0) {
                Py_DECREF(dict);
                return NULL;
            }
        }
        return dict;
    }

    /* Create a three-level trie */
    PyObject *result = PyObject_Malloc(sizeof(struct encoding_map) +
                                       16*count2 + 128*count3 - 1);
    if (result == NULL) {
        return PyErr_NoMemory();
    }
    _PyObject_Init(result, &EncodingMapType);

    struct encoding_map *map = (struct encoding_map*)result;
    map->count2 = count2;
    map->count3 = count3;
    unsigned char *mlevel1 = map->level1;
    unsigned char *mlevel2 = map->level23;
    unsigned char *mlevel3 = map->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);

    /* Second pass: hand out level-3 blocks again, in the same order as the
       first pass, and record the byte value of every mapped character. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE) {
            /* unmapped character */
            continue;
        }
        int o1 = ch >> 11;
        int o2 = (ch >> 7) & 0xF;
        int o3 = ch & 0x7F;
        int slot2 = 16*mlevel1[o1] + o2;
        if (mlevel2[slot2] == 0xFF) {
            mlevel2[slot2] = count3++;
        }
        int slot3 = 128*mlevel2[slot2] + o3;
        mlevel3[slot3] = i;
    }
    return result;
}
8707
8708
static int
8709
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8710
0
{
8711
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8712
0
    int l1 = c>>11;
8713
0
    int l2 = (c>>7) & 0xF;
8714
0
    int l3 = c & 0x7F;
8715
0
    int i;
8716
8717
0
    if (c > 0xFFFF)
8718
0
        return -1;
8719
0
    if (c == 0)
8720
0
        return 0;
8721
    /* level 1*/
8722
0
    i = map->level1[l1];
8723
0
    if (i == 0xFF) {
8724
0
        return -1;
8725
0
    }
8726
    /* level 2*/
8727
0
    i = map->level23[16*i+l2];
8728
0
    if (i == 0xFF) {
8729
0
        return -1;
8730
0
    }
8731
    /* level 3 */
8732
0
    i = map->level23[16*map->count2 + 128*i + l3];
8733
0
    if (i == 0) {
8734
0
        return -1;
8735
0
    }
8736
0
    return i;
8737
0
}
8738
8739
/* Lookup the character in the mapping.
8740
   On success, return PyLong, PyBytes or None (if the character can't be found).
8741
   If the result is PyLong, put its value in replace.
8742
   On error, return NULL.
8743
   */
8744
static PyObject *
8745
charmapencode_lookup(Py_UCS4 c, PyObject *mapping, unsigned char *replace)
8746
0
{
8747
0
    PyObject *w = PyLong_FromLong((long)c);
8748
0
    PyObject *x;
8749
8750
0
    if (w == NULL)
8751
0
        return NULL;
8752
0
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
8753
0
    Py_DECREF(w);
8754
0
    if (rc == 0) {
8755
        /* No mapping found means: mapping is undefined. */
8756
0
        Py_RETURN_NONE;
8757
0
    }
8758
0
    if (x == NULL) {
8759
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8760
            /* No mapping found means: mapping is undefined. */
8761
0
            PyErr_Clear();
8762
0
            Py_RETURN_NONE;
8763
0
        } else
8764
0
            return NULL;
8765
0
    }
8766
0
    else if (x == Py_None)
8767
0
        return x;
8768
0
    else if (PyLong_Check(x)) {
8769
0
        long value = PyLong_AsLong(x);
8770
0
        if (value < 0 || value > 255) {
8771
0
            PyErr_SetString(PyExc_TypeError,
8772
0
                            "character mapping must be in range(256)");
8773
0
            Py_DECREF(x);
8774
0
            return NULL;
8775
0
        }
8776
0
        *replace = (unsigned char)value;
8777
0
        return x;
8778
0
    }
8779
0
    else if (PyBytes_Check(x))
8780
0
        return x;
8781
0
    else {
8782
        /* wrong return value */
8783
0
        PyErr_Format(PyExc_TypeError,
8784
0
                     "character mapping must return integer, bytes or None, not %.400s",
8785
0
                     Py_TYPE(x)->tp_name);
8786
0
        Py_DECREF(x);
8787
0
        return NULL;
8788
0
    }
8789
0
}
8790
8791
static int
8792
charmapencode_resize(PyBytesWriter *writer, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8793
0
{
8794
0
    Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8795
    /* exponentially overallocate to minimize reallocations */
8796
0
    if (requiredsize < 2 * outsize)
8797
0
        requiredsize = 2 * outsize;
8798
0
    return PyBytesWriter_Resize(writer, requiredsize);
8799
0
}
8800
8801
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping has no entry; nothing was written */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
8804
/* lookup the character, put the result in the output string and adjust
8805
   various state variables. Resize the output bytes object if not enough
8806
   space is available. Return enc_SUCCESS if the replacement was written
8807
   to the output buffer, enc_FAILED if the mapping was undefined
8808
   (in which case no character was written) or enc_EXCEPTION if the
8809
   lookup or a reallocation failed. */
8810
static charmapencode_result
8811
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8812
                     PyBytesWriter *writer, Py_ssize_t *outpos)
8813
0
{
8814
0
    PyObject *rep;
8815
0
    unsigned char replace;
8816
0
    char *outstart;
8817
0
    Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);
8818
8819
0
    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8820
0
        int res = encoding_map_lookup(c, mapping);
8821
0
        Py_ssize_t requiredsize = *outpos+1;
8822
0
        if (res == -1) {
8823
0
            return enc_FAILED;
8824
0
        }
8825
8826
0
        if (outsize<requiredsize) {
8827
0
            if (charmapencode_resize(writer, outpos, requiredsize)) {
8828
0
                return enc_EXCEPTION;
8829
0
            }
8830
0
        }
8831
0
        outstart = _PyBytesWriter_GetData(writer);
8832
0
        outstart[(*outpos)++] = (char)res;
8833
0
        return enc_SUCCESS;
8834
0
    }
8835
8836
0
    rep = charmapencode_lookup(c, mapping, &replace);
8837
0
    if (rep==NULL)
8838
0
        return enc_EXCEPTION;
8839
0
    else if (rep==Py_None) {
8840
0
        Py_DECREF(rep);
8841
0
        return enc_FAILED;
8842
0
    } else {
8843
0
        if (PyLong_Check(rep)) {
8844
0
            Py_ssize_t requiredsize = *outpos+1;
8845
0
            if (outsize<requiredsize)
8846
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8847
0
                    Py_DECREF(rep);
8848
0
                    return enc_EXCEPTION;
8849
0
                }
8850
0
            outstart = _PyBytesWriter_GetData(writer);
8851
0
            outstart[(*outpos)++] = (char)replace;
8852
0
        }
8853
0
        else {
8854
0
            const char *repchars = PyBytes_AS_STRING(rep);
8855
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8856
0
            Py_ssize_t requiredsize = *outpos+repsize;
8857
0
            if (outsize<requiredsize)
8858
0
                if (charmapencode_resize(writer, outpos, requiredsize)) {
8859
0
                    Py_DECREF(rep);
8860
0
                    return enc_EXCEPTION;
8861
0
                }
8862
0
            outstart = _PyBytesWriter_GetData(writer);
8863
0
            memcpy(outstart + *outpos, repchars, repsize);
8864
0
            *outpos += repsize;
8865
0
        }
8866
0
    }
8867
0
    Py_DECREF(rep);
8868
0
    return enc_SUCCESS;
8869
0
}
8870
8871
/* handle an error in _PyUnicode_EncodeCharmap()
8872
   Return 0 on success, -1 on error */
8873
static int
8874
charmap_encoding_error(
8875
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8876
    PyObject **exceptionObject,
8877
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8878
    PyBytesWriter *writer, Py_ssize_t *respos)
8879
0
{
8880
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8881
0
    Py_ssize_t size, repsize;
8882
0
    Py_ssize_t newpos;
8883
0
    int kind;
8884
0
    const void *data;
8885
0
    Py_ssize_t index;
8886
    /* startpos for collecting unencodable chars */
8887
0
    Py_ssize_t collstartpos = *inpos;
8888
0
    Py_ssize_t collendpos = *inpos+1;
8889
0
    Py_ssize_t collpos;
8890
0
    const char *encoding = "charmap";
8891
0
    const char *reason = "character maps to <undefined>";
8892
0
    charmapencode_result x;
8893
0
    Py_UCS4 ch;
8894
0
    int val;
8895
8896
0
    size = PyUnicode_GET_LENGTH(unicode);
8897
    /* find all unencodable characters */
8898
0
    while (collendpos < size) {
8899
0
        PyObject *rep;
8900
0
        unsigned char replace;
8901
0
        if (Py_IS_TYPE(mapping, &EncodingMapType)) {
8902
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8903
0
            val = encoding_map_lookup(ch, mapping);
8904
0
            if (val != -1)
8905
0
                break;
8906
0
            ++collendpos;
8907
0
            continue;
8908
0
        }
8909
8910
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8911
0
        rep = charmapencode_lookup(ch, mapping, &replace);
8912
0
        if (rep==NULL)
8913
0
            return -1;
8914
0
        else if (rep!=Py_None) {
8915
0
            Py_DECREF(rep);
8916
0
            break;
8917
0
        }
8918
0
        Py_DECREF(rep);
8919
0
        ++collendpos;
8920
0
    }
8921
    /* cache callback name lookup
8922
     * (if not done yet, i.e. it's the first error) */
8923
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8924
0
        *error_handler = _Py_GetErrorHandler(errors);
8925
8926
0
    switch (*error_handler) {
8927
0
    case _Py_ERROR_STRICT:
8928
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8929
0
        return -1;
8930
8931
0
    case _Py_ERROR_REPLACE:
8932
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8933
0
            x = charmapencode_output('?', mapping, writer, respos);
8934
0
            if (x==enc_EXCEPTION) {
8935
0
                return -1;
8936
0
            }
8937
0
            else if (x==enc_FAILED) {
8938
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8939
0
                return -1;
8940
0
            }
8941
0
        }
8942
0
        _Py_FALLTHROUGH;
8943
0
    case _Py_ERROR_IGNORE:
8944
0
        *inpos = collendpos;
8945
0
        break;
8946
8947
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8948
        /* generate replacement (temporarily (mis)uses p) */
8949
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8950
0
            char buffer[2+29+1+1];
8951
0
            char *cp;
8952
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8953
0
            for (cp = buffer; *cp; ++cp) {
8954
0
                x = charmapencode_output(*cp, mapping, writer, respos);
8955
0
                if (x==enc_EXCEPTION)
8956
0
                    return -1;
8957
0
                else if (x==enc_FAILED) {
8958
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8959
0
                    return -1;
8960
0
                }
8961
0
            }
8962
0
        }
8963
0
        *inpos = collendpos;
8964
0
        break;
8965
8966
0
    default:
8967
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8968
0
                                                      encoding, reason, unicode, exceptionObject,
8969
0
                                                      collstartpos, collendpos, &newpos);
8970
0
        if (repunicode == NULL)
8971
0
            return -1;
8972
0
        if (PyBytes_Check(repunicode)) {
8973
            /* Directly copy bytes result to output. */
8974
0
            Py_ssize_t outsize = PyBytesWriter_GetSize(writer);
8975
0
            Py_ssize_t requiredsize;
8976
0
            repsize = PyBytes_Size(repunicode);
8977
0
            requiredsize = *respos + repsize;
8978
0
            if (requiredsize > outsize)
8979
                /* Make room for all additional bytes. */
8980
0
                if (charmapencode_resize(writer, respos, requiredsize)) {
8981
0
                    Py_DECREF(repunicode);
8982
0
                    return -1;
8983
0
                }
8984
0
            memcpy((char*)PyBytesWriter_GetData(writer) + *respos,
8985
0
                   PyBytes_AsString(repunicode),  repsize);
8986
0
            *respos += repsize;
8987
0
            *inpos = newpos;
8988
0
            Py_DECREF(repunicode);
8989
0
            break;
8990
0
        }
8991
        /* generate replacement  */
8992
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8993
0
        data = PyUnicode_DATA(repunicode);
8994
0
        kind = PyUnicode_KIND(repunicode);
8995
0
        for (index = 0; index < repsize; index++) {
8996
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8997
0
            x = charmapencode_output(repch, mapping, writer, respos);
8998
0
            if (x==enc_EXCEPTION) {
8999
0
                Py_DECREF(repunicode);
9000
0
                return -1;
9001
0
            }
9002
0
            else if (x==enc_FAILED) {
9003
0
                Py_DECREF(repunicode);
9004
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
9005
0
                return -1;
9006
0
            }
9007
0
        }
9008
0
        *inpos = newpos;
9009
0
        Py_DECREF(repunicode);
9010
0
    }
9011
0
    return 0;
9012
0
}
9013
9014
/* Encode `unicode` to bytes through `mapping`.

   A NULL mapping means Latin-1 encoding.  An EncodingMap trie is walked
   inline; any other mapping goes through charmapencode_output().
   Characters the mapping cannot encode are passed to
   charmap_encoding_error() together with `errors`.
   Returns the encoded bytes, or NULL on failure. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    /* Default to Latin-1 */
    if (mapping == NULL) {
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
    if (size == 0) {
        return Py_GetConstant(Py_CONSTANT_EMPTY_BYTES);
    }
    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);

    /* error handling state, shared by all failures in this call */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;

    /* current input position */
    Py_ssize_t inpos = 0;
    /* current output position */
    Py_ssize_t respos = 0;

    /* allocate enough for a simple encoding without
       replacements, if we need more, we'll resize */
    PyBytesWriter *writer = PyBytesWriter_Create(size);
    if (writer == NULL) {
        goto onError;
    }

    if (Py_IS_TYPE(mapping, &EncodingMapType)) {
        char *outstart = _PyBytesWriter_GetData(writer);
        Py_ssize_t outsize = _PyBytesWriter_GetSize(writer);

        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            /* try to encode it */
            int res = encoding_map_lookup(ch, mapping);
            if (res == -1) {
                /* unencodable character */
                if (charmap_encoding_error(unicode, &inpos, mapping,
                                           &exc,
                                           &error_handler, &error_handler_obj, errors,
                                           writer, &respos)) {
                    goto onError;
                }
                /* the handler may have resized the writer */
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
                continue;
            }

            Py_ssize_t requiredsize = respos + 1;
            if (outsize < requiredsize) {
                if (charmapencode_resize(writer, &respos, requiredsize)) {
                    goto onError;
                }
                outstart = _PyBytesWriter_GetData(writer);
                outsize = _PyBytesWriter_GetSize(writer);
            }
            outstart[respos++] = (char)res;

            /* done with this character => adjust input position */
            ++inpos;
        }
    }
    else {
        while (inpos < size) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

            /* try to encode it */
            charmapencode_result x = charmapencode_output(ch, mapping, writer, &respos);
            if (x == enc_EXCEPTION) { /* error */
                goto onError;
            }
            if (x != enc_FAILED) {
                /* done with this character => adjust input position */
                ++inpos;
                continue;
            }

            /* unencodable character */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       writer, &respos)) {
                goto onError;
            }
        }
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);

    /* Resize if we allocated too much */
    return PyBytesWriter_FinishWithSize(writer, respos);

  onError:
    PyBytesWriter_Discard(writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
9122
9123
/* Encode the str `unicode` through `mapping` with the default (NULL) error
   handling.  Calls PyErr_BadArgument() and returns NULL if `unicode` is not
   a str or `mapping` is NULL. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL) {
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
    }

    PyErr_BadArgument();
    return NULL;
}
9133
9134
/* create or adjust a UnicodeTranslateError */
9135
static void
9136
make_translate_exception(PyObject **exceptionObject,
9137
                         PyObject *unicode,
9138
                         Py_ssize_t startpos, Py_ssize_t endpos,
9139
                         const char *reason)
9140
0
{
9141
0
    if (*exceptionObject == NULL) {
9142
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
9143
0
            unicode, startpos, endpos, reason);
9144
0
    }
9145
0
    else {
9146
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9147
0
            goto onError;
9148
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9149
0
            goto onError;
9150
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9151
0
            goto onError;
9152
0
        return;
9153
0
      onError:
9154
0
        Py_CLEAR(*exceptionObject);
9155
0
    }
9156
0
}
9157
9158
/* error handling callback helper:
9159
   build arguments, call the callback and check the arguments,
9160
   put the result into newpos and return the replacement string, which
9161
   has to be freed by the caller */
9162
static PyObject *
9163
unicode_translate_call_errorhandler(const char *errors,
9164
                                    PyObject **errorHandler,
9165
                                    const char *reason,
9166
                                    PyObject *unicode, PyObject **exceptionObject,
9167
                                    Py_ssize_t startpos, Py_ssize_t endpos,
9168
                                    Py_ssize_t *newpos)
9169
0
{
9170
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
9171
9172
0
    Py_ssize_t i_newpos;
9173
0
    PyObject *restuple;
9174
0
    PyObject *resunicode;
9175
9176
0
    if (*errorHandler == NULL) {
9177
0
        *errorHandler = PyCodec_LookupError(errors);
9178
0
        if (*errorHandler == NULL)
9179
0
            return NULL;
9180
0
    }
9181
9182
0
    make_translate_exception(exceptionObject,
9183
0
                             unicode, startpos, endpos, reason);
9184
0
    if (*exceptionObject == NULL)
9185
0
        return NULL;
9186
9187
0
    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
9188
0
    if (restuple == NULL)
9189
0
        return NULL;
9190
0
    if (!PyTuple_Check(restuple)) {
9191
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
9192
0
        Py_DECREF(restuple);
9193
0
        return NULL;
9194
0
    }
9195
0
    if (!PyArg_ParseTuple(restuple, argparse,
9196
0
                          &resunicode, &i_newpos)) {
9197
0
        Py_DECREF(restuple);
9198
0
        return NULL;
9199
0
    }
9200
0
    if (i_newpos<0)
9201
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
9202
0
    else
9203
0
        *newpos = i_newpos;
9204
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
9205
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
9206
0
        Py_DECREF(restuple);
9207
0
        return NULL;
9208
0
    }
9209
0
    Py_INCREF(resunicode);
9210
0
    Py_DECREF(restuple);
9211
0
    return resunicode;
9212
0
}
9213
9214
/* Lookup the character ch in the mapping and put the result in result,
9215
   which must be decrefed by the caller.
9216
   The result can be PyLong, PyUnicode, None or NULL.
9217
   If the result is PyLong, put its value in replace.
9218
   Return 0 on success, -1 on error */
9219
static int
9220
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result, Py_UCS4 *replace)
9221
338
{
9222
338
    PyObject *w = PyLong_FromLong((long)c);
9223
338
    PyObject *x;
9224
9225
338
    if (w == NULL)
9226
0
        return -1;
9227
338
    int rc = PyMapping_GetOptionalItem(mapping, w, &x);
9228
338
    Py_DECREF(w);
9229
338
    if (rc == 0) {
9230
        /* No mapping found means: use 1:1 mapping. */
9231
158
        *result = NULL;
9232
158
        return 0;
9233
158
    }
9234
180
    if (x == NULL) {
9235
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9236
            /* No mapping found means: use 1:1 mapping. */
9237
0
            PyErr_Clear();
9238
0
            *result = NULL;
9239
0
            return 0;
9240
0
        } else
9241
0
            return -1;
9242
0
    }
9243
180
    else if (x == Py_None) {
9244
0
        *result = x;
9245
0
        return 0;
9246
0
    }
9247
180
    else if (PyLong_Check(x)) {
9248
0
        long value = PyLong_AsLong(x);
9249
0
        if (value < 0 || value > MAX_UNICODE) {
9250
0
            PyErr_Format(PyExc_ValueError,
9251
0
                         "character mapping must be in range(0x%x)",
9252
0
                         MAX_UNICODE+1);
9253
0
            Py_DECREF(x);
9254
0
            return -1;
9255
0
        }
9256
0
        *result = x;
9257
0
        *replace = (Py_UCS4)value;
9258
0
        return 0;
9259
0
    }
9260
180
    else if (PyUnicode_Check(x)) {
9261
180
        *result = x;
9262
180
        return 0;
9263
180
    }
9264
0
    else {
9265
        /* wrong return value */
9266
0
        PyErr_SetString(PyExc_TypeError,
9267
0
                        "character mapping must return integer, None or str");
9268
0
        Py_DECREF(x);
9269
0
        return -1;
9270
0
    }
9271
180
}
9272
9273
/* lookup the character, write the result into the writer.
9274
   Return 1 if the result was written into the writer, return 0 if the mapping
9275
   was undefined, raise an exception return -1 on error. */
9276
static int
9277
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9278
                        _PyUnicodeWriter *writer)
9279
199
{
9280
199
    PyObject *item;
9281
199
    Py_UCS4 replace;
9282
9283
199
    if (charmaptranslate_lookup(ch, mapping, &item, &replace))
9284
0
        return -1;
9285
9286
199
    if (item == NULL) {
9287
        /* not found => default to 1:1 mapping */
9288
75
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9289
0
            return -1;
9290
0
        }
9291
75
        return 1;
9292
75
    }
9293
9294
124
    if (item == Py_None) {
9295
0
        Py_DECREF(item);
9296
0
        return 0;
9297
0
    }
9298
9299
124
    if (PyLong_Check(item)) {
9300
0
        if (_PyUnicodeWriter_WriteCharInline(writer, replace) < 0) {
9301
0
            Py_DECREF(item);
9302
0
            return -1;
9303
0
        }
9304
0
        Py_DECREF(item);
9305
0
        return 1;
9306
0
    }
9307
9308
124
    if (!PyUnicode_Check(item)) {
9309
0
        Py_DECREF(item);
9310
0
        return -1;
9311
0
    }
9312
9313
124
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9314
0
        Py_DECREF(item);
9315
0
        return -1;
9316
0
    }
9317
9318
124
    Py_DECREF(item);
9319
124
    return 1;
9320
124
}
9321
9322
static int
9323
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9324
                              Py_UCS1 *translate)
9325
139
{
9326
139
    PyObject *item = NULL;
9327
139
    Py_UCS4 replace;
9328
139
    int ret = 0;
9329
9330
139
    if (charmaptranslate_lookup(ch, mapping, &item, &replace)) {
9331
0
        return -1;
9332
0
    }
9333
9334
139
    if (item == Py_None) {
9335
        /* deletion */
9336
0
        translate[ch] = 0xfe;
9337
0
    }
9338
139
    else if (item == NULL) {
9339
        /* not found => default to 1:1 mapping */
9340
83
        translate[ch] = ch;
9341
83
        return 1;
9342
83
    }
9343
56
    else if (PyLong_Check(item)) {
9344
0
        if (replace > 127) {
9345
            /* invalid character or character outside ASCII:
9346
               skip the fast translate */
9347
0
            goto exit;
9348
0
        }
9349
0
        translate[ch] = (Py_UCS1)replace;
9350
0
    }
9351
56
    else if (PyUnicode_Check(item)) {
9352
56
        if (PyUnicode_GET_LENGTH(item) != 1)
9353
56
            goto exit;
9354
9355
0
        replace = PyUnicode_READ_CHAR(item, 0);
9356
0
        if (replace > 127)
9357
0
            goto exit;
9358
0
        translate[ch] = (Py_UCS1)replace;
9359
0
    }
9360
0
    else {
9361
        /* not None, NULL, long or unicode */
9362
0
        goto exit;
9363
0
    }
9364
0
    ret = 1;
9365
9366
56
  exit:
9367
56
    Py_DECREF(item);
9368
56
    return ret;
9369
0
}
9370
9371
/* Fast path for ascii => ascii translation. Return 1 if the whole string
9372
   was translated into writer, return 0 if the input string was partially
9373
   translated into writer, raise an exception and return -1 on error. */
9374
static int
9375
unicode_fast_translate(PyObject *input, PyObject *mapping,
9376
                       _PyUnicodeWriter *writer, int ignore,
9377
                       Py_ssize_t *input_pos)
9378
104
{
9379
104
    Py_UCS1 ascii_table[128], ch, ch2;
9380
104
    Py_ssize_t len;
9381
104
    const Py_UCS1 *in, *end;
9382
104
    Py_UCS1 *out;
9383
104
    int res = 0;
9384
9385
104
    len = PyUnicode_GET_LENGTH(input);
9386
9387
104
    memset(ascii_table, 0xff, 128);
9388
9389
104
    in = PyUnicode_1BYTE_DATA(input);
9390
104
    end = in + len;
9391
9392
104
    assert(PyUnicode_IS_ASCII(writer->buffer));
9393
104
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9394
104
    out = PyUnicode_1BYTE_DATA(writer->buffer);
9395
9396
201
    for (; in < end; in++) {
9397
153
        ch = *in;
9398
153
        ch2 = ascii_table[ch];
9399
153
        if (ch2 == 0xff) {
9400
139
            int translate = unicode_fast_translate_lookup(mapping, ch,
9401
139
                                                          ascii_table);
9402
139
            if (translate < 0)
9403
0
                return -1;
9404
139
            if (translate == 0)
9405
56
                goto exit;
9406
83
            ch2 = ascii_table[ch];
9407
83
        }
9408
97
        if (ch2 == 0xfe) {
9409
0
            if (ignore)
9410
0
                continue;
9411
0
            goto exit;
9412
0
        }
9413
97
        assert(ch2 < 128);
9414
97
        *out = ch2;
9415
97
        out++;
9416
97
    }
9417
48
    res = 1;
9418
9419
104
exit:
9420
104
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
9421
104
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
9422
104
    return res;
9423
48
}
9424
9425
/* Translate every character of `input` through `mapping` and return the
   resulting string.

   ASCII inputs first go through unicode_fast_translate(); whatever it leaves
   unconsumed is handled by the generic loop, one character at a time, with
   charmaptranslate_output().  When a character cannot be translated, the run
   of following characters whose lookup yields None is collected with it; the
   run is then skipped if `errors` is "ignore", or passed to the error handler
   selected by `errors`, whose replacement string is written to the output.

   A NULL `mapping` raises via PyErr_BadArgument().  An empty `input` is
   returned through PyUnicode_FromObject().  Returns NULL on error. */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    const char *reason = "character maps to <undefined>";
    PyObject *error_handler = NULL;
    PyObject *exc = NULL;
    _PyUnicodeWriter writer;
    const void *data;
    Py_ssize_t size;
    Py_ssize_t pos = 0;
    int kind;
    int ignore;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    data = PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    size = PyUnicode_GET_LENGTH(input);
    if (size == 0) {
        return PyUnicode_FromObject(input);
    }

    /* Reserve room for a plain 1:1 translation; the writer is resized
       if replacements turn out to need more. */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1) {
        goto onError;
    }

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    if (PyUnicode_IS_ASCII(input)) {
        int fast = unicode_fast_translate(input, mapping, &writer, ignore,
                                          &pos);
        if (fast < 0) {
            _PyUnicodeWriter_Dealloc(&writer);
            return NULL;
        }
        if (fast == 1) {
            return _PyUnicodeWriter_Finish(&writer);
        }
        /* fast == 0: continue from `pos` with the generic loop */
    }

    while (pos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int translated = charmaptranslate_output(ch, mapping, &writer);

        if (translated < 0) {
            goto onError;
        }
        if (translated != 0) {
            /* it worked => move on to the next input character */
            pos++;
            continue;
        }

        /* Untranslatable character: extend the run [run_start, run_end)
           over the following characters whose lookup gives None. */
        Py_ssize_t run_start = pos;
        Py_ssize_t run_end = pos + 1;
        for (; run_end < size; run_end++) {
            PyObject *value;
            Py_UCS4 replace;

            ch = PyUnicode_READ(kind, data, run_end);
            if (charmaptranslate_lookup(ch, mapping, &value, &replace)) {
                goto onError;
            }
            Py_XDECREF(value);
            /* only the pointer identity is inspected after the decref */
            if (value != Py_None) {
                break;
            }
        }

        if (ignore) {
            pos = run_end;
            continue;
        }

        Py_ssize_t newpos;
        PyObject *replacement = unicode_translate_call_errorhandler(
            errors, &error_handler, reason, input, &exc,
            run_start, run_end, &newpos);
        if (replacement == NULL) {
            goto onError;
        }
        int rc = _PyUnicodeWriter_WriteStr(&writer, replacement);
        Py_DECREF(replacement);
        if (rc < 0) {
            goto onError;
        }
        pos = newpos;
    }

    Py_XDECREF(exc);
    Py_XDECREF(error_handler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler);
    return NULL;
}
9541
9542
/* Check that `str` passes ensure_unicode(), then translate it through
   `mapping` with _PyUnicode_TranslateCharmap().  Returns NULL if the check
   fails or the translation reports an error. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0) {
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    }
    return NULL;
}
9551
9552
/* Build an ASCII version of `unicode` in which whitespace becomes ' ' and
   decimal digits become '0'..'9'.

   An ASCII string is returned unchanged (new reference).  Otherwise each
   character below 127 is copied as is, whitespace is replaced by a space and
   characters with a decimal value by the matching ASCII digit.  The first
   character that fits none of these is replaced by '?' and the result is
   cut off right after it.

   Returns NULL with an error set if `unicode` is not a str, or if the
   result cannot be allocated. */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_IS_ASCII(unicode)) {
        /* Already ASCII: hand back the very same string. */
        return Py_NewRef(unicode);
    }

    const Py_ssize_t length = PyUnicode_GET_LENGTH(unicode);
    PyObject *ascii = PyUnicode_New(length, 127);
    if (ascii == NULL) {
        return NULL;
    }

    Py_UCS1 *dst = PyUnicode_1BYTE_DATA(ascii);
    const int kind = PyUnicode_KIND(unicode);
    const void *src = PyUnicode_DATA(unicode);

    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src, pos);

        if (ch < 127) {
            dst[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            dst[pos] = ' ';
            continue;
        }

        const int digit = Py_UNICODE_TODECIMAL(ch);
        if (digit < 0) {
            /* Not translatable: mark the spot and truncate the result. */
            dst[pos] = '?';
            dst[pos + 1] = '\0';
            _PyUnicode_LENGTH(ascii) = pos + 1;
            break;
        }
        dst[pos] = '0' + digit;
    }

    assert(_PyUnicode_CheckConsistency(ascii, 1));
    return ascii;
}
9597
9598
/* --- Helpers ------------------------------------------------------------ */
9599
9600
/* Helper macro to fix up start/end slice values in place for a sequence of
   length `len`:

     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` has `len` added to it and, if it is still
       negative, is set to 0.

   `start` is not clamped against `len`.  `start` and `end` must be
   modifiable lvalues.  Every argument is parenthesized in the expansion so
   that arbitrary expressions can be passed safely; `len` may be evaluated
   more than once and therefore must not have side effects. */
#define ADJUST_INDICES(start, end, len) \
    do {                                \
        if ((end) > (len)) {            \
            (end) = (len);              \
        }                               \
        else if ((end) < 0) {           \
            (end) += (len);             \
            if ((end) < 0) {            \
                (end) = 0;              \
            }                           \
        }                               \
        if ((start) < 0) {              \
            (start) += (len);           \
            if ((start) < 0) {          \
                (start) = 0;            \
            }                           \
        }                               \
    } while (0)
9619
9620
static Py_ssize_t
9621
any_find_slice(PyObject* s1, PyObject* s2,
9622
               Py_ssize_t start,
9623
               Py_ssize_t end,
9624
               int direction)
9625
16.7M
{
9626
16.7M
    int kind1, kind2;
9627
16.7M
    const void *buf1, *buf2;
9628
16.7M
    Py_ssize_t len1, len2, result;
9629
9630
16.7M
    kind1 = PyUnicode_KIND(s1);
9631
16.7M
    kind2 = PyUnicode_KIND(s2);
9632
16.7M
    if (kind1 < kind2)
9633
0
        return -1;
9634
9635
16.7M
    len1 = PyUnicode_GET_LENGTH(s1);
9636
16.7M
    len2 = PyUnicode_GET_LENGTH(s2);
9637
16.7M
    ADJUST_INDICES(start, end, len1);
9638
16.7M
    if (end - start < len2)
9639
26.8k
        return -1;
9640
9641
16.6M
    buf1 = PyUnicode_DATA(s1);
9642
16.6M
    buf2 = PyUnicode_DATA(s2);
9643
16.6M
    if (len2 == 1) {
9644
16.6M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9645
16.6M
        result = findchar((const char *)buf1 + kind1*start,
9646
16.6M
                          kind1, end - start, ch, direction);
9647
16.6M
        if (result == -1)
9648
198k
            return -1;
9649
16.4M
        else
9650
16.4M
            return start + result;
9651
16.6M
    }
9652
9653
37.6k
    if (kind2 != kind1) {
9654
27.2k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
9655
27.2k
        if (!buf2)
9656
0
            return -2;
9657
27.2k
    }
9658
9659
37.6k
    if (direction > 0) {
9660
37.6k
        switch (kind1) {
9661
10.3k
        case PyUnicode_1BYTE_KIND:
9662
10.3k
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9663
5.54k
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9664
4.79k
            else
9665
4.79k
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9666
10.3k
            break;
9667
19.9k
        case PyUnicode_2BYTE_KIND:
9668
19.9k
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9669
19.9k
            break;
9670
7.37k
        case PyUnicode_4BYTE_KIND:
9671
7.37k
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9672
7.37k
            break;
9673
0
        default:
9674
0
            Py_UNREACHABLE();
9675
37.6k
        }
9676
37.6k
    }
9677
0
    else {
9678
0
        switch (kind1) {
9679
0
        case PyUnicode_1BYTE_KIND:
9680
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9681
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9682
0
            else
9683
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9684
0
            break;
9685
0
        case PyUnicode_2BYTE_KIND:
9686
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9687
0
            break;
9688
0
        case PyUnicode_4BYTE_KIND:
9689
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9690
0
            break;
9691
0
        default:
9692
0
            Py_UNREACHABLE();
9693
0
        }
9694
0
    }
9695
9696
37.6k
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
9697
37.6k
    if (kind2 != kind1)
9698
27.2k
        PyMem_Free((void *)buf2);
9699
9700
37.6k
    return result;
9701
37.6k
}
9702
9703
9704
Py_ssize_t
9705
PyUnicode_Count(PyObject *str,
9706
                PyObject *substr,
9707
                Py_ssize_t start,
9708
                Py_ssize_t end)
9709
0
{
9710
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9711
0
        return -1;
9712
9713
0
    return unicode_count_impl(str, substr, start, end);
9714
0
}
9715
9716
Py_ssize_t
9717
PyUnicode_Find(PyObject *str,
9718
               PyObject *substr,
9719
               Py_ssize_t start,
9720
               Py_ssize_t end,
9721
               int direction)
9722
0
{
9723
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9724
0
        return -2;
9725
9726
0
    return any_find_slice(str, substr, start, end, direction);
9727
0
}
9728
9729
Py_ssize_t
9730
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9731
                   Py_ssize_t start, Py_ssize_t end,
9732
                   int direction)
9733
469k
{
9734
469k
    int kind;
9735
469k
    Py_ssize_t len, result;
9736
469k
    len = PyUnicode_GET_LENGTH(str);
9737
469k
    ADJUST_INDICES(start, end, len);
9738
469k
    if (end - start < 1)
9739
0
        return -1;
9740
469k
    kind = PyUnicode_KIND(str);
9741
469k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9742
469k
                      kind, end-start, ch, direction);
9743
469k
    if (result == -1)
9744
50.2k
        return -1;
9745
418k
    else
9746
418k
        return start + result;
9747
469k
}
9748
9749
static int
9750
tailmatch(PyObject *self,
9751
          PyObject *substring,
9752
          Py_ssize_t start,
9753
          Py_ssize_t end,
9754
          int direction)
9755
98.4M
{
9756
98.4M
    int kind_self;
9757
98.4M
    int kind_sub;
9758
98.4M
    const void *data_self;
9759
98.4M
    const void *data_sub;
9760
98.4M
    Py_ssize_t offset;
9761
98.4M
    Py_ssize_t i;
9762
98.4M
    Py_ssize_t end_sub;
9763
9764
98.4M
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9765
98.4M
    end -= PyUnicode_GET_LENGTH(substring);
9766
98.4M
    if (end < start)
9767
12.1M
        return 0;
9768
9769
86.3M
    if (PyUnicode_GET_LENGTH(substring) == 0)
9770
0
        return 1;
9771
9772
86.3M
    kind_self = PyUnicode_KIND(self);
9773
86.3M
    data_self = PyUnicode_DATA(self);
9774
86.3M
    kind_sub = PyUnicode_KIND(substring);
9775
86.3M
    data_sub = PyUnicode_DATA(substring);
9776
86.3M
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9777
9778
86.3M
    if (direction > 0)
9779
7.96M
        offset = end;
9780
78.3M
    else
9781
78.3M
        offset = start;
9782
9783
86.3M
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9784
86.3M
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9785
42.1M
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9786
42.1M
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9787
        /* If both are of the same kind, memcmp is sufficient */
9788
13.2M
        if (kind_self == kind_sub) {
9789
9.24M
            return ! memcmp((char *)data_self +
9790
9.24M
                                (offset * PyUnicode_KIND(substring)),
9791
9.24M
                            data_sub,
9792
9.24M
                            PyUnicode_GET_LENGTH(substring) *
9793
9.24M
                                PyUnicode_KIND(substring));
9794
9.24M
        }
9795
        /* otherwise we have to compare each character by first accessing it */
9796
4.04M
        else {
9797
            /* We do not need to compare 0 and len(substring)-1 because
9798
               the if statement above ensured already that they are equal
9799
               when we end up here. */
9800
4.10M
            for (i = 1; i < end_sub; ++i) {
9801
65.8k
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9802
65.8k
                    PyUnicode_READ(kind_sub, data_sub, i))
9803
4.94k
                    return 0;
9804
65.8k
            }
9805
4.03M
            return 1;
9806
4.04M
        }
9807
13.2M
    }
9808
9809
73.0M
    return 0;
9810
86.3M
}
9811
9812
Py_ssize_t
9813
PyUnicode_Tailmatch(PyObject *str,
9814
                    PyObject *substr,
9815
                    Py_ssize_t start,
9816
                    Py_ssize_t end,
9817
                    int direction)
9818
0
{
9819
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9820
0
        return -1;
9821
9822
0
    return tailmatch(str, substr, start, end, direction);
9823
0
}
9824
9825
/* Return a new string of the same length as `self` (created with max
   character 127) whose data is produced by _Py_bytes_lower() when `lower`
   is non-zero and by _Py_bytes_upper() otherwise.  Returns NULL if the
   result cannot be allocated. */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const char *src = PyUnicode_DATA(self);

    PyObject *converted = PyUnicode_New(length, 127);
    if (converted == NULL) {
        return NULL;
    }

    char *dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, length);
    }
    else {
        _Py_bytes_upper(dst, src, length);
    }
    return converted;
}
9843
9844
static Py_UCS4
9845
handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
9846
37.5k
{
9847
37.5k
    Py_ssize_t j;
9848
37.5k
    int final_sigma;
9849
37.5k
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9850
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9851
9852
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9853
9854
    where ! is a negation and \p{xxx} is a character with property xxx.
9855
    */
9856
86.5k
    for (j = i - 1; j >= 0; j--) {
9857
84.7k
        c = PyUnicode_READ(kind, data, j);
9858
84.7k
        if (!_PyUnicode_IsCaseIgnorable(c))
9859
35.8k
            break;
9860
84.7k
    }
9861
37.5k
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9862
37.5k
    if (final_sigma) {
9863
65.4k
        for (j = i + 1; j < length; j++) {
9864
63.3k
            c = PyUnicode_READ(kind, data, j);
9865
63.3k
            if (!_PyUnicode_IsCaseIgnorable(c))
9866
24.6k
                break;
9867
63.3k
        }
9868
26.7k
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9869
26.7k
    }
9870
37.5k
    return (final_sigma) ? 0x3C2 : 0x3C3;
9871
37.5k
}
9872
9873
static int
9874
lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
9875
           Py_UCS4 c, Py_UCS4 *mapped)
9876
78.2M
{
9877
    /* Obscure special case. */
9878
78.2M
    if (c == 0x3A3) {
9879
37.5k
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9880
37.5k
        return 1;
9881
37.5k
    }
9882
78.2M
    return _PyUnicode_ToLowerFull(c, mapped);
9883
78.2M
}
9884
9885
static Py_ssize_t
9886
do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9887
0
{
9888
0
    Py_ssize_t i, k = 0;
9889
0
    int n_res, j;
9890
0
    Py_UCS4 c, mapped[3];
9891
9892
0
    c = PyUnicode_READ(kind, data, 0);
9893
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9894
0
    for (j = 0; j < n_res; j++) {
9895
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9896
0
        res[k++] = mapped[j];
9897
0
    }
9898
0
    for (i = 1; i < length; i++) {
9899
0
        c = PyUnicode_READ(kind, data, i);
9900
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9901
0
        for (j = 0; j < n_res; j++) {
9902
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9903
0
            res[k++] = mapped[j];
9904
0
        }
9905
0
    }
9906
0
    return k;
9907
0
}
9908
9909
static Py_ssize_t
9910
0
do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9911
0
    Py_ssize_t i, k = 0;
9912
9913
0
    for (i = 0; i < length; i++) {
9914
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9915
0
        int n_res, j;
9916
0
        if (Py_UNICODE_ISUPPER(c)) {
9917
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9918
0
        }
9919
0
        else if (Py_UNICODE_ISLOWER(c)) {
9920
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9921
0
        }
9922
0
        else {
9923
0
            n_res = 1;
9924
0
            mapped[0] = c;
9925
0
        }
9926
0
        for (j = 0; j < n_res; j++) {
9927
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9928
0
            res[k++] = mapped[j];
9929
0
        }
9930
0
    }
9931
0
    return k;
9932
0
}
9933
9934
static Py_ssize_t
9935
do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
9936
                  Py_UCS4 *maxchar, int lower)
9937
21.5M
{
9938
21.5M
    Py_ssize_t i, k = 0;
9939
9940
99.8M
    for (i = 0; i < length; i++) {
9941
78.2M
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9942
78.2M
        int n_res, j;
9943
78.2M
        if (lower)
9944
78.2M
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9945
0
        else
9946
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9947
156M
        for (j = 0; j < n_res; j++) {
9948
78.2M
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9949
78.2M
            res[k++] = mapped[j];
9950
78.2M
        }
9951
78.2M
    }
9952
21.5M
    return k;
9953
21.5M
}
9954
9955
static Py_ssize_t
9956
do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9957
0
{
9958
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9959
0
}
9960
9961
static Py_ssize_t
9962
do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9963
21.5M
{
9964
21.5M
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9965
21.5M
}
9966
9967
static Py_ssize_t
9968
do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9969
0
{
9970
0
    Py_ssize_t i, k = 0;
9971
9972
0
    for (i = 0; i < length; i++) {
9973
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9974
0
        Py_UCS4 mapped[3];
9975
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9976
0
        for (j = 0; j < n_res; j++) {
9977
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9978
0
            res[k++] = mapped[j];
9979
0
        }
9980
0
    }
9981
0
    return k;
9982
0
}
9983
9984
static Py_ssize_t
9985
do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9986
0
{
9987
0
    Py_ssize_t i, k = 0;
9988
0
    int previous_is_cased;
9989
9990
0
    previous_is_cased = 0;
9991
0
    for (i = 0; i < length; i++) {
9992
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9993
0
        Py_UCS4 mapped[3];
9994
0
        int n_res, j;
9995
9996
0
        if (previous_is_cased)
9997
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9998
0
        else
9999
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
10000
10001
0
        for (j = 0; j < n_res; j++) {
10002
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
10003
0
            res[k++] = mapped[j];
10004
0
        }
10005
10006
0
        previous_is_cased = _PyUnicode_IsCased(c);
10007
0
    }
10008
0
    return k;
10009
0
}
10010
10011
static PyObject *
10012
case_operation(PyObject *self,
10013
               Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
10014
21.5M
{
10015
21.5M
    PyObject *res = NULL;
10016
21.5M
    Py_ssize_t length, newlength = 0;
10017
21.5M
    int kind, outkind;
10018
21.5M
    const void *data;
10019
21.5M
    void *outdata;
10020
21.5M
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
10021
10022
21.5M
    kind = PyUnicode_KIND(self);
10023
21.5M
    data = PyUnicode_DATA(self);
10024
21.5M
    length = PyUnicode_GET_LENGTH(self);
10025
21.5M
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
10026
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
10027
0
        return NULL;
10028
0
    }
10029
21.5M
    tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
10030
21.5M
    if (tmp == NULL)
10031
0
        return PyErr_NoMemory();
10032
21.5M
    newlength = perform(kind, data, length, tmp, &maxchar);
10033
21.5M
    res = PyUnicode_New(newlength, maxchar);
10034
21.5M
    if (res == NULL)
10035
0
        goto leave;
10036
21.5M
    tmpend = tmp + newlength;
10037
21.5M
    outdata = PyUnicode_DATA(res);
10038
21.5M
    outkind = PyUnicode_KIND(res);
10039
21.5M
    switch (outkind) {
10040
217k
    case PyUnicode_1BYTE_KIND:
10041
217k
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10042
217k
        break;
10043
21.3M
    case PyUnicode_2BYTE_KIND:
10044
21.3M
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10045
21.3M
        break;
10046
48.8k
    case PyUnicode_4BYTE_KIND:
10047
48.8k
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10048
48.8k
        break;
10049
0
    default:
10050
0
        Py_UNREACHABLE();
10051
21.5M
    }
10052
21.5M
  leave:
10053
21.5M
    PyMem_Free(tmp);
10054
21.5M
    return res;
10055
21.5M
}
10056
10057
/* Join the items of the iterable `seq` with `separator`.

   `seq` is converted with PySequence_Fast() (error message "can only join an
   iterable"), and its items are joined by _PyUnicode_JoinArray() inside the
   Py_BEGIN/END_CRITICAL_SECTION_SEQUENCE_FAST pair taken on `seq`.
   Returns NULL if the conversion or the join fails. */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    Py_BEGIN_CRITICAL_SECTION_SEQUENCE_FAST(seq);

    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));

    Py_END_CRITICAL_SECTION_SEQUENCE_FAST();

    Py_DECREF(fast);
    return joined;
}
10081
10082
PyObject *
10083
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
10084
61.1M
{
10085
61.1M
    PyObject *res = NULL; /* the result */
10086
61.1M
    PyObject *sep = NULL;
10087
61.1M
    Py_ssize_t seplen;
10088
61.1M
    PyObject *item;
10089
61.1M
    Py_ssize_t sz, i, res_offset;
10090
61.1M
    Py_UCS4 maxchar;
10091
61.1M
    Py_UCS4 item_maxchar;
10092
61.1M
    int use_memcpy;
10093
61.1M
    unsigned char *res_data = NULL, *sep_data = NULL;
10094
61.1M
    PyObject *last_obj;
10095
61.1M
    int kind = 0;
10096
10097
    /* If empty sequence, return u"". */
10098
61.1M
    if (seqlen == 0) {
10099
5.35M
        _Py_RETURN_UNICODE_EMPTY();
10100
5.35M
    }
10101
10102
    /* If singleton sequence with an exact Unicode, return that. */
10103
55.8M
    last_obj = NULL;
10104
55.8M
    if (seqlen == 1) {
10105
6.93M
        if (PyUnicode_CheckExact(items[0])) {
10106
5.24M
            res = items[0];
10107
5.24M
            return Py_NewRef(res);
10108
5.24M
        }
10109
1.69M
        seplen = 0;
10110
1.69M
        maxchar = 0;
10111
1.69M
    }
10112
48.8M
    else {
10113
        /* Set up sep and seplen */
10114
48.8M
        if (separator == NULL) {
10115
            /* fall back to a blank space separator */
10116
0
            sep = PyUnicode_FromOrdinal(' ');
10117
0
            if (!sep)
10118
0
                goto onError;
10119
0
            seplen = 1;
10120
0
            maxchar = 32;
10121
0
        }
10122
48.8M
        else {
10123
48.8M
            if (!PyUnicode_Check(separator)) {
10124
0
                PyErr_Format(PyExc_TypeError,
10125
0
                             "separator: expected str instance,"
10126
0
                             " %.80s found",
10127
0
                             Py_TYPE(separator)->tp_name);
10128
0
                goto onError;
10129
0
            }
10130
48.8M
            sep = separator;
10131
48.8M
            seplen = PyUnicode_GET_LENGTH(separator);
10132
48.8M
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10133
            /* inc refcount to keep this code path symmetric with the
10134
               above case of a blank separator */
10135
48.8M
            Py_INCREF(sep);
10136
48.8M
        }
10137
48.8M
        last_obj = sep;
10138
48.8M
    }
10139
10140
    /* There are at least two things to join, or else we have a subclass
10141
     * of str in the sequence.
10142
     * Do a pre-pass to figure out the total amount of space we'll
10143
     * need (sz), and see whether all argument are strings.
10144
     */
10145
50.5M
    sz = 0;
10146
#ifdef Py_DEBUG
10147
    use_memcpy = 0;
10148
#else
10149
50.5M
    use_memcpy = 1;
10150
50.5M
#endif
10151
403M
    for (i = 0; i < seqlen; i++) {
10152
353M
        size_t add_sz;
10153
353M
        item = items[i];
10154
353M
        if (!PyUnicode_Check(item)) {
10155
0
            PyErr_Format(PyExc_TypeError,
10156
0
                         "sequence item %zd: expected str instance,"
10157
0
                         " %.80s found",
10158
0
                         i, Py_TYPE(item)->tp_name);
10159
0
            goto onError;
10160
0
        }
10161
353M
        add_sz = PyUnicode_GET_LENGTH(item);
10162
353M
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10163
353M
        maxchar = Py_MAX(maxchar, item_maxchar);
10164
353M
        if (i != 0) {
10165
302M
            add_sz += seplen;
10166
302M
        }
10167
353M
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10168
0
            PyErr_SetString(PyExc_OverflowError,
10169
0
                            "join() result is too long for a Python string");
10170
0
            goto onError;
10171
0
        }
10172
353M
        sz += add_sz;
10173
353M
        if (use_memcpy && last_obj != NULL) {
10174
286M
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10175
4.56M
                use_memcpy = 0;
10176
286M
        }
10177
353M
        last_obj = item;
10178
353M
    }
10179
10180
50.5M
    res = PyUnicode_New(sz, maxchar);
10181
50.5M
    if (res == NULL)
10182
0
        goto onError;
10183
10184
    /* Catenate everything. */
10185
#ifdef Py_DEBUG
10186
    use_memcpy = 0;
10187
#else
10188
50.5M
    if (use_memcpy) {
10189
46.0M
        res_data = PyUnicode_1BYTE_DATA(res);
10190
46.0M
        kind = PyUnicode_KIND(res);
10191
46.0M
        if (seplen != 0)
10192
19.3k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10193
46.0M
    }
10194
50.5M
#endif
10195
50.5M
    if (use_memcpy) {
10196
305M
        for (i = 0; i < seqlen; ++i) {
10197
259M
            Py_ssize_t itemlen;
10198
259M
            item = items[i];
10199
10200
            /* Copy item, and maybe the separator. */
10201
259M
            if (i && seplen != 0) {
10202
25.9k
                memcpy(res_data,
10203
25.9k
                          sep_data,
10204
25.9k
                          kind * seplen);
10205
25.9k
                res_data += kind * seplen;
10206
25.9k
            }
10207
10208
259M
            itemlen = PyUnicode_GET_LENGTH(item);
10209
259M
            if (itemlen != 0) {
10210
224M
                memcpy(res_data,
10211
224M
                          PyUnicode_DATA(item),
10212
224M
                          kind * itemlen);
10213
224M
                res_data += kind * itemlen;
10214
224M
            }
10215
259M
        }
10216
46.0M
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10217
46.0M
                           + kind * PyUnicode_GET_LENGTH(res));
10218
46.0M
    }
10219
4.56M
    else {
10220
97.7M
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10221
93.2M
            Py_ssize_t itemlen;
10222
93.2M
            item = items[i];
10223
10224
            /* Copy item, and maybe the separator. */
10225
93.2M
            if (i && seplen != 0) {
10226
65.8k
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10227
65.8k
                res_offset += seplen;
10228
65.8k
            }
10229
10230
93.2M
            itemlen = PyUnicode_GET_LENGTH(item);
10231
93.2M
            if (itemlen != 0) {
10232
91.4M
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10233
91.4M
                res_offset += itemlen;
10234
91.4M
            }
10235
93.2M
        }
10236
4.56M
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10237
4.56M
    }
10238
10239
50.5M
    Py_XDECREF(sep);
10240
50.5M
    assert(_PyUnicode_CheckConsistency(res, 1));
10241
50.5M
    return res;
10242
10243
0
  onError:
10244
0
    Py_XDECREF(sep);
10245
0
    Py_XDECREF(res);
10246
0
    return NULL;
10247
50.5M
}
10248
10249
void
10250
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10251
                    Py_UCS4 fill_char)
10252
739
{
10253
739
    const int kind = PyUnicode_KIND(unicode);
10254
739
    void *data = PyUnicode_DATA(unicode);
10255
739
    assert(unicode_modifiable(unicode));
10256
739
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10257
739
    assert(start >= 0);
10258
739
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10259
739
    _PyUnicode_Fill(kind, data, fill_char, start, length);
10260
739
}
10261
10262
Py_ssize_t
10263
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10264
               Py_UCS4 fill_char)
10265
739
{
10266
739
    Py_ssize_t maxlen;
10267
10268
739
    if (!PyUnicode_Check(unicode)) {
10269
0
        PyErr_BadInternalCall();
10270
0
        return -1;
10271
0
    }
10272
739
    if (unicode_check_modifiable(unicode))
10273
0
        return -1;
10274
10275
739
    if (start < 0) {
10276
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10277
0
        return -1;
10278
0
    }
10279
739
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10280
0
        PyErr_SetString(PyExc_ValueError,
10281
0
                         "fill character is bigger than "
10282
0
                         "the string maximum character");
10283
0
        return -1;
10284
0
    }
10285
10286
739
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10287
739
    length = Py_MIN(maxlen, length);
10288
739
    if (length <= 0)
10289
0
        return 0;
10290
10291
739
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10292
739
    return length;
10293
739
}
10294
10295
static PyObject *
10296
pad(PyObject *self,
10297
    Py_ssize_t left,
10298
    Py_ssize_t right,
10299
    Py_UCS4 fill)
10300
0
{
10301
0
    PyObject *u;
10302
0
    Py_UCS4 maxchar;
10303
0
    int kind;
10304
0
    void *data;
10305
10306
0
    if (left < 0)
10307
0
        left = 0;
10308
0
    if (right < 0)
10309
0
        right = 0;
10310
10311
0
    if (left == 0 && right == 0)
10312
0
        return unicode_result_unchanged(self);
10313
10314
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10315
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10316
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10317
0
        return NULL;
10318
0
    }
10319
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10320
0
    maxchar = Py_MAX(maxchar, fill);
10321
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10322
0
    if (!u)
10323
0
        return NULL;
10324
10325
0
    kind = PyUnicode_KIND(u);
10326
0
    data = PyUnicode_DATA(u);
10327
0
    if (left)
10328
0
        _PyUnicode_Fill(kind, data, fill, 0, left);
10329
0
    if (right)
10330
0
        _PyUnicode_Fill(kind, data, fill,
10331
0
                        left + _PyUnicode_LENGTH(self), right);
10332
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10333
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10334
0
    return u;
10335
0
}
10336
10337
/* Split `string` into lines by dispatching to the stringlib splitlines()
   instantiation that matches the string's kind (with a dedicated ASCII
   variant for 1-byte strings); `keepends` is forwarded unchanged.

   Returns whatever the selected splitter returns, or NULL when
   ensure_unicode() reports a failure. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(
                string, PyUnicode_1BYTE_DATA(string), nchars, keepends);
        }
        return ucs1lib_splitlines(
            string, PyUnicode_1BYTE_DATA(string), nchars, keepends);
    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(
            string, PyUnicode_2BYTE_DATA(string), nchars, keepends);
    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(
            string, PyUnicode_4BYTE_DATA(string), nchars, keepends);
    default:
        Py_UNREACHABLE();
    }
}
10371
10372
static PyObject *
10373
split(PyObject *self,
10374
      PyObject *substring,
10375
      Py_ssize_t maxcount)
10376
22.7M
{
10377
22.7M
    int kind1, kind2;
10378
22.7M
    const void *buf1, *buf2;
10379
22.7M
    Py_ssize_t len1, len2;
10380
22.7M
    PyObject* out;
10381
22.7M
    len1 = PyUnicode_GET_LENGTH(self);
10382
22.7M
    kind1 = PyUnicode_KIND(self);
10383
10384
22.7M
    if (substring == NULL) {
10385
180k
        if (maxcount < 0) {
10386
154k
            maxcount = (len1 - 1) / 2 + 1;
10387
154k
        }
10388
180k
        switch (kind1) {
10389
119k
        case PyUnicode_1BYTE_KIND:
10390
119k
            if (PyUnicode_IS_ASCII(self))
10391
86.3k
                return asciilib_split_whitespace(
10392
86.3k
                    self,  PyUnicode_1BYTE_DATA(self),
10393
86.3k
                    len1, maxcount
10394
86.3k
                    );
10395
33.2k
            else
10396
33.2k
                return ucs1lib_split_whitespace(
10397
33.2k
                    self,  PyUnicode_1BYTE_DATA(self),
10398
33.2k
                    len1, maxcount
10399
33.2k
                    );
10400
50.3k
        case PyUnicode_2BYTE_KIND:
10401
50.3k
            return ucs2lib_split_whitespace(
10402
50.3k
                self,  PyUnicode_2BYTE_DATA(self),
10403
50.3k
                len1, maxcount
10404
50.3k
                );
10405
10.5k
        case PyUnicode_4BYTE_KIND:
10406
10.5k
            return ucs4lib_split_whitespace(
10407
10.5k
                self,  PyUnicode_4BYTE_DATA(self),
10408
10.5k
                len1, maxcount
10409
10.5k
                );
10410
0
        default:
10411
0
            Py_UNREACHABLE();
10412
180k
        }
10413
180k
    }
10414
10415
22.6M
    kind2 = PyUnicode_KIND(substring);
10416
22.6M
    len2 = PyUnicode_GET_LENGTH(substring);
10417
22.6M
    if (maxcount < 0) {
10418
        // if len2 == 0, it will raise ValueError.
10419
11.8M
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10420
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10421
11.8M
        maxcount = maxcount < 0 ? len1 : maxcount;
10422
11.8M
    }
10423
22.6M
    if (kind1 < kind2 || len1 < len2) {
10424
5.63M
        out = PyList_New(1);
10425
5.63M
        if (out == NULL)
10426
0
            return NULL;
10427
5.63M
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10428
5.63M
        return out;
10429
5.63M
    }
10430
16.9M
    buf1 = PyUnicode_DATA(self);
10431
16.9M
    buf2 = PyUnicode_DATA(substring);
10432
16.9M
    if (kind2 != kind1) {
10433
216k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10434
216k
        if (!buf2)
10435
0
            return NULL;
10436
216k
    }
10437
10438
16.9M
    switch (kind1) {
10439
16.7M
    case PyUnicode_1BYTE_KIND:
10440
16.7M
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10441
15.6M
            out = asciilib_split(
10442
15.6M
                self,  buf1, len1, buf2, len2, maxcount);
10443
1.06M
        else
10444
1.06M
            out = ucs1lib_split(
10445
1.06M
                self,  buf1, len1, buf2, len2, maxcount);
10446
16.7M
        break;
10447
181k
    case PyUnicode_2BYTE_KIND:
10448
181k
        out = ucs2lib_split(
10449
181k
            self,  buf1, len1, buf2, len2, maxcount);
10450
181k
        break;
10451
35.0k
    case PyUnicode_4BYTE_KIND:
10452
35.0k
        out = ucs4lib_split(
10453
35.0k
            self,  buf1, len1, buf2, len2, maxcount);
10454
35.0k
        break;
10455
0
    default:
10456
0
        out = NULL;
10457
16.9M
    }
10458
16.9M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10459
16.9M
    if (kind2 != kind1)
10460
216k
        PyMem_Free((void *)buf2);
10461
16.9M
    return out;
10462
16.9M
}
10463
10464
static PyObject *
10465
rsplit(PyObject *self,
10466
       PyObject *substring,
10467
       Py_ssize_t maxcount)
10468
50
{
10469
50
    int kind1, kind2;
10470
50
    const void *buf1, *buf2;
10471
50
    Py_ssize_t len1, len2;
10472
50
    PyObject* out;
10473
10474
50
    len1 = PyUnicode_GET_LENGTH(self);
10475
50
    kind1 = PyUnicode_KIND(self);
10476
10477
50
    if (substring == NULL) {
10478
0
        if (maxcount < 0) {
10479
0
            maxcount = (len1 - 1) / 2 + 1;
10480
0
        }
10481
0
        switch (kind1) {
10482
0
        case PyUnicode_1BYTE_KIND:
10483
0
            if (PyUnicode_IS_ASCII(self))
10484
0
                return asciilib_rsplit_whitespace(
10485
0
                    self,  PyUnicode_1BYTE_DATA(self),
10486
0
                    len1, maxcount
10487
0
                    );
10488
0
            else
10489
0
                return ucs1lib_rsplit_whitespace(
10490
0
                    self,  PyUnicode_1BYTE_DATA(self),
10491
0
                    len1, maxcount
10492
0
                    );
10493
0
        case PyUnicode_2BYTE_KIND:
10494
0
            return ucs2lib_rsplit_whitespace(
10495
0
                self,  PyUnicode_2BYTE_DATA(self),
10496
0
                len1, maxcount
10497
0
                );
10498
0
        case PyUnicode_4BYTE_KIND:
10499
0
            return ucs4lib_rsplit_whitespace(
10500
0
                self,  PyUnicode_4BYTE_DATA(self),
10501
0
                len1, maxcount
10502
0
                );
10503
0
        default:
10504
0
            Py_UNREACHABLE();
10505
0
        }
10506
0
    }
10507
50
    kind2 = PyUnicode_KIND(substring);
10508
50
    len2 = PyUnicode_GET_LENGTH(substring);
10509
50
    if (maxcount < 0) {
10510
        // if len2 == 0, it will raise ValueError.
10511
0
        maxcount = len2 == 0 ? 0 : (len1 / len2) + 1;
10512
        // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1
10513
0
        maxcount = maxcount < 0 ? len1 : maxcount;
10514
0
    }
10515
50
    if (kind1 < kind2 || len1 < len2) {
10516
0
        out = PyList_New(1);
10517
0
        if (out == NULL)
10518
0
            return NULL;
10519
0
        PyList_SET_ITEM(out, 0, Py_NewRef(self));
10520
0
        return out;
10521
0
    }
10522
50
    buf1 = PyUnicode_DATA(self);
10523
50
    buf2 = PyUnicode_DATA(substring);
10524
50
    if (kind2 != kind1) {
10525
0
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
10526
0
        if (!buf2)
10527
0
            return NULL;
10528
0
    }
10529
10530
50
    switch (kind1) {
10531
50
    case PyUnicode_1BYTE_KIND:
10532
50
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10533
50
            out = asciilib_rsplit(
10534
50
                self,  buf1, len1, buf2, len2, maxcount);
10535
0
        else
10536
0
            out = ucs1lib_rsplit(
10537
0
                self,  buf1, len1, buf2, len2, maxcount);
10538
50
        break;
10539
0
    case PyUnicode_2BYTE_KIND:
10540
0
        out = ucs2lib_rsplit(
10541
0
            self,  buf1, len1, buf2, len2, maxcount);
10542
0
        break;
10543
0
    case PyUnicode_4BYTE_KIND:
10544
0
        out = ucs4lib_rsplit(
10545
0
            self,  buf1, len1, buf2, len2, maxcount);
10546
0
        break;
10547
0
    default:
10548
0
        out = NULL;
10549
50
    }
10550
50
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
10551
50
    if (kind2 != kind1)
10552
0
        PyMem_Free((void *)buf2);
10553
50
    return out;
10554
50
}
10555
10556
static Py_ssize_t
10557
anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10558
            PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10559
166M
{
10560
166M
    switch (kind) {
10561
24.7M
    case PyUnicode_1BYTE_KIND:
10562
24.7M
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10563
20.9M
            return asciilib_find(buf1, len1, buf2, len2, offset);
10564
3.87M
        else
10565
3.87M
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10566
68.9M
    case PyUnicode_2BYTE_KIND:
10567
68.9M
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10568
72.6M
    case PyUnicode_4BYTE_KIND:
10569
72.6M
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10570
166M
    }
10571
166M
    Py_UNREACHABLE();
10572
166M
}
10573
10574
static Py_ssize_t
10575
anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10576
             PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10577
40.0M
{
10578
40.0M
    switch (kind) {
10579
35.5M
    case PyUnicode_1BYTE_KIND:
10580
35.5M
        return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10581
4.44M
    case PyUnicode_2BYTE_KIND:
10582
4.44M
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10583
120k
    case PyUnicode_4BYTE_KIND:
10584
120k
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10585
40.0M
    }
10586
40.0M
    Py_UNREACHABLE();
10587
40.0M
}
10588
10589
static void
10590
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10591
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10592
1.33M
{
10593
1.33M
    int kind = PyUnicode_KIND(u);
10594
1.33M
    void *data = PyUnicode_DATA(u);
10595
1.33M
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10596
1.33M
    if (kind == PyUnicode_1BYTE_KIND) {
10597
602k
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10598
602k
                                      (Py_UCS1 *)data + len,
10599
602k
                                      u1, u2, maxcount);
10600
602k
    }
10601
734k
    else if (kind == PyUnicode_2BYTE_KIND) {
10602
720k
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10603
720k
                                      (Py_UCS2 *)data + len,
10604
720k
                                      u1, u2, maxcount);
10605
720k
    }
10606
14.0k
    else {
10607
14.0k
        assert(kind == PyUnicode_4BYTE_KIND);
10608
14.0k
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10609
14.0k
                                      (Py_UCS4 *)data + len,
10610
14.0k
                                      u1, u2, maxcount);
10611
14.0k
    }
10612
1.33M
}
10613
10614
static PyObject *
10615
replace(PyObject *self, PyObject *str1,
10616
        PyObject *str2, Py_ssize_t maxcount)
10617
78.7M
{
10618
78.7M
    PyObject *u;
10619
78.7M
    const char *sbuf = PyUnicode_DATA(self);
10620
78.7M
    const void *buf1 = PyUnicode_DATA(str1);
10621
78.7M
    const void *buf2 = PyUnicode_DATA(str2);
10622
78.7M
    int srelease = 0, release1 = 0, release2 = 0;
10623
78.7M
    int skind = PyUnicode_KIND(self);
10624
78.7M
    int kind1 = PyUnicode_KIND(str1);
10625
78.7M
    int kind2 = PyUnicode_KIND(str2);
10626
78.7M
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10627
78.7M
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10628
78.7M
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10629
78.7M
    int mayshrink;
10630
78.7M
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10631
10632
78.7M
    if (slen < len1)
10633
32.7M
        goto nothing;
10634
10635
45.9M
    if (maxcount < 0)
10636
45.9M
        maxcount = PY_SSIZE_T_MAX;
10637
0
    else if (maxcount == 0)
10638
0
        goto nothing;
10639
10640
45.9M
    if (str1 == str2)
10641
0
        goto nothing;
10642
10643
45.9M
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10644
45.9M
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10645
45.9M
    if (maxchar < maxchar_str1)
10646
        /* substring too wide to be present */
10647
0
        goto nothing;
10648
45.9M
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10649
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10650
       result string. */
10651
45.9M
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10652
45.9M
    maxchar = Py_MAX(maxchar, maxchar_str2);
10653
10654
45.9M
    if (len1 == len2) {
10655
        /* same length */
10656
5.91M
        if (len1 == 0)
10657
0
            goto nothing;
10658
5.91M
        if (len1 == 1) {
10659
            /* replace characters */
10660
5.91M
            Py_UCS4 u1, u2;
10661
5.91M
            Py_ssize_t pos;
10662
10663
5.91M
            u1 = PyUnicode_READ(kind1, buf1, 0);
10664
5.91M
            pos = findchar(sbuf, skind, slen, u1, 1);
10665
5.91M
            if (pos < 0)
10666
4.57M
                goto nothing;
10667
1.33M
            u2 = PyUnicode_READ(kind2, buf2, 0);
10668
1.33M
            u = PyUnicode_New(slen, maxchar);
10669
1.33M
            if (!u)
10670
0
                goto error;
10671
10672
1.33M
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10673
1.33M
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10674
1.33M
        }
10675
0
        else {
10676
0
            int rkind = skind;
10677
0
            char *res;
10678
0
            Py_ssize_t i;
10679
10680
0
            if (kind1 < rkind) {
10681
                /* widen substring */
10682
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10683
0
                if (!buf1) goto error;
10684
0
                release1 = 1;
10685
0
            }
10686
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10687
0
            if (i < 0)
10688
0
                goto nothing;
10689
0
            if (rkind > kind2) {
10690
                /* widen replacement */
10691
0
                buf2 = unicode_askind(kind2, buf2, len2, rkind);
10692
0
                if (!buf2) goto error;
10693
0
                release2 = 1;
10694
0
            }
10695
0
            else if (rkind < kind2) {
10696
                /* widen self and buf1 */
10697
0
                rkind = kind2;
10698
0
                if (release1) {
10699
0
                    assert(buf1 != PyUnicode_DATA(str1));
10700
0
                    PyMem_Free((void *)buf1);
10701
0
                    buf1 = PyUnicode_DATA(str1);
10702
0
                    release1 = 0;
10703
0
                }
10704
0
                sbuf = unicode_askind(skind, sbuf, slen, rkind);
10705
0
                if (!sbuf) goto error;
10706
0
                srelease = 1;
10707
0
                buf1 = unicode_askind(kind1, buf1, len1, rkind);
10708
0
                if (!buf1) goto error;
10709
0
                release1 = 1;
10710
0
            }
10711
0
            u = PyUnicode_New(slen, maxchar);
10712
0
            if (!u)
10713
0
                goto error;
10714
0
            assert(PyUnicode_KIND(u) == rkind);
10715
0
            res = PyUnicode_DATA(u);
10716
10717
0
            memcpy(res, sbuf, rkind * slen);
10718
            /* change everything in-place, starting with this one */
10719
0
            memcpy(res + rkind * i,
10720
0
                   buf2,
10721
0
                   rkind * len2);
10722
0
            i += len1;
10723
10724
0
            while ( --maxcount > 0) {
10725
0
                i = anylib_find(rkind, self,
10726
0
                                sbuf+rkind*i, slen-i,
10727
0
                                str1, buf1, len1, i);
10728
0
                if (i == -1)
10729
0
                    break;
10730
0
                memcpy(res + rkind * i,
10731
0
                       buf2,
10732
0
                       rkind * len2);
10733
0
                i += len1;
10734
0
            }
10735
0
        }
10736
5.91M
    }
10737
40.0M
    else {
10738
40.0M
        Py_ssize_t n, i, j, ires;
10739
40.0M
        Py_ssize_t new_size;
10740
40.0M
        int rkind = skind;
10741
40.0M
        char *res;
10742
10743
40.0M
        if (kind1 < rkind) {
10744
            /* widen substring */
10745
4.56M
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10746
4.56M
            if (!buf1) goto error;
10747
4.56M
            release1 = 1;
10748
4.56M
        }
10749
40.0M
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10750
40.0M
        if (n == 0)
10751
35.1M
            goto nothing;
10752
4.92M
        if (kind2 < rkind) {
10753
            /* widen replacement */
10754
963k
            buf2 = unicode_askind(kind2, buf2, len2, rkind);
10755
963k
            if (!buf2) goto error;
10756
963k
            release2 = 1;
10757
963k
        }
10758
3.96M
        else if (kind2 > rkind) {
10759
            /* widen self and buf1 */
10760
0
            rkind = kind2;
10761
0
            sbuf = unicode_askind(skind, sbuf, slen, rkind);
10762
0
            if (!sbuf) goto error;
10763
0
            srelease = 1;
10764
0
            if (release1) {
10765
0
                assert(buf1 != PyUnicode_DATA(str1));
10766
0
                PyMem_Free((void *)buf1);
10767
0
                buf1 = PyUnicode_DATA(str1);
10768
0
                release1 = 0;
10769
0
            }
10770
0
            buf1 = unicode_askind(kind1, buf1, len1, rkind);
10771
0
            if (!buf1) goto error;
10772
0
            release1 = 1;
10773
0
        }
10774
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10775
           PyUnicode_GET_LENGTH(str1)); */
10776
4.92M
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10777
0
                PyErr_SetString(PyExc_OverflowError,
10778
0
                                "replace string is too long");
10779
0
                goto error;
10780
0
        }
10781
4.92M
        new_size = slen + n * (len2 - len1);
10782
4.92M
        if (new_size == 0) {
10783
0
            u = unicode_get_empty();
10784
0
            goto done;
10785
0
        }
10786
4.92M
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10787
0
            PyErr_SetString(PyExc_OverflowError,
10788
0
                            "replace string is too long");
10789
0
            goto error;
10790
0
        }
10791
4.92M
        u = PyUnicode_New(new_size, maxchar);
10792
4.92M
        if (!u)
10793
0
            goto error;
10794
4.92M
        assert(PyUnicode_KIND(u) == rkind);
10795
4.92M
        res = PyUnicode_DATA(u);
10796
4.92M
        ires = i = 0;
10797
4.92M
        if (len1 > 0) {
10798
171M
            while (n-- > 0) {
10799
                /* look for next match */
10800
166M
                j = anylib_find(rkind, self,
10801
166M
                                sbuf + rkind * i, slen-i,
10802
166M
                                str1, buf1, len1, i);
10803
166M
                if (j == -1)
10804
0
                    break;
10805
166M
                else if (j > i) {
10806
                    /* copy unchanged part [i:j] */
10807
20.5M
                    memcpy(res + rkind * ires,
10808
20.5M
                           sbuf + rkind * i,
10809
20.5M
                           rkind * (j-i));
10810
20.5M
                    ires += j - i;
10811
20.5M
                }
10812
                /* copy substitution string */
10813
166M
                if (len2 > 0) {
10814
166M
                    memcpy(res + rkind * ires,
10815
166M
                           buf2,
10816
166M
                           rkind * len2);
10817
166M
                    ires += len2;
10818
166M
                }
10819
166M
                i = j + len1;
10820
166M
            }
10821
4.92M
            if (i < slen)
10822
                /* copy tail [i:] */
10823
4.84M
                memcpy(res + rkind * ires,
10824
4.84M
                       sbuf + rkind * i,
10825
4.84M
                       rkind * (slen-i));
10826
4.92M
        }
10827
0
        else {
10828
            /* interleave */
10829
0
            while (n > 0) {
10830
0
                memcpy(res + rkind * ires,
10831
0
                       buf2,
10832
0
                       rkind * len2);
10833
0
                ires += len2;
10834
0
                if (--n <= 0)
10835
0
                    break;
10836
0
                memcpy(res + rkind * ires,
10837
0
                       sbuf + rkind * i,
10838
0
                       rkind);
10839
0
                ires++;
10840
0
                i++;
10841
0
            }
10842
0
            memcpy(res + rkind * ires,
10843
0
                   sbuf + rkind * i,
10844
0
                   rkind * (slen-i));
10845
0
        }
10846
4.92M
    }
10847
10848
6.26M
    if (mayshrink) {
10849
0
        unicode_adjust_maxchar(&u);
10850
0
        if (u == NULL)
10851
0
            goto error;
10852
0
    }
10853
10854
6.26M
  done:
10855
6.26M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10856
6.26M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10857
6.26M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10858
6.26M
    if (srelease)
10859
0
        PyMem_Free((void *)sbuf);
10860
6.26M
    if (release1)
10861
963k
        PyMem_Free((void *)buf1);
10862
6.26M
    if (release2)
10863
963k
        PyMem_Free((void *)buf2);
10864
6.26M
    assert(_PyUnicode_CheckConsistency(u, 1));
10865
6.26M
    return u;
10866
10867
72.5M
  nothing:
10868
    /* nothing to replace; return original string (when possible) */
10869
72.5M
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10870
72.5M
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10871
72.5M
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10872
72.5M
    if (srelease)
10873
0
        PyMem_Free((void *)sbuf);
10874
72.5M
    if (release1)
10875
3.59M
        PyMem_Free((void *)buf1);
10876
72.5M
    if (release2)
10877
0
        PyMem_Free((void *)buf2);
10878
72.5M
    return unicode_result_unchanged(self);
10879
10880
0
  error:
10881
0
    assert(srelease == (sbuf != PyUnicode_DATA(self)));
10882
0
    assert(release1 == (buf1 != PyUnicode_DATA(str1)));
10883
0
    assert(release2 == (buf2 != PyUnicode_DATA(str2)));
10884
0
    if (srelease)
10885
0
        PyMem_Free((void *)sbuf);
10886
0
    if (release1)
10887
0
        PyMem_Free((void *)buf1);
10888
0
    if (release2)
10889
0
        PyMem_Free((void *)buf2);
10890
0
    return NULL;
10891
6.26M
}
10892
10893
/* --- Unicode Object Methods --------------------------------------------- */
10894
10895
/*[clinic input]
10896
@permit_long_docstring_body
10897
str.title as unicode_title
10898
10899
Return a version of the string where each word is titlecased.
10900
10901
More specifically, words start with uppercased characters and all remaining
10902
cased characters have lower case.
10903
[clinic start generated code]*/
10904
10905
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=533ce0eb6a7f5d1b]*/
{
    /* str.title(): run the generic case-mapping driver with the
       title-casing transform. */
    PyObject *titled = case_operation(self, do_title);
    return titled;
}
10911
10912
/*[clinic input]
10913
@permit_long_docstring_body
10914
str.capitalize as unicode_capitalize
10915
10916
Return a capitalized version of the string.
10917
10918
More specifically, make the first character have upper case and the rest lower
10919
case.
10920
[clinic start generated code]*/
10921
10922
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=a4a15ade41f6f9e9]*/
{
    /* str.capitalize(): a non-empty string goes through the generic
       case-mapping driver; an empty string is returned via
       unicode_result_unchanged(). */
    if (PyUnicode_GET_LENGTH(self) != 0) {
        return case_operation(self, do_capitalize);
    }
    return unicode_result_unchanged(self);
}
10930
10931
/*[clinic input]
10932
str.casefold as unicode_casefold
10933
10934
Return a version of the string suitable for caseless comparisons.
10935
[clinic start generated code]*/
10936
10937
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* str.casefold(): non-ASCII strings need the generic case-mapping
       driver; ASCII strings take the ascii_upper_or_lower(self, 1)
       shortcut instead. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_casefold);
    }
    return ascii_upper_or_lower(self, 1);
}
10945
10946
10947
/* Argument converter. Accepts a single Unicode character. */
10948
10949
static int
10950
convert_uc(PyObject *obj, void *addr)
10951
0
{
10952
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10953
10954
0
    if (!PyUnicode_Check(obj)) {
10955
0
        PyErr_Format(PyExc_TypeError,
10956
0
                     "The fill character must be a unicode character, "
10957
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10958
0
        return 0;
10959
0
    }
10960
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10961
0
        PyErr_SetString(PyExc_TypeError,
10962
0
                        "The fill character must be exactly one character long");
10963
0
        return 0;
10964
0
    }
10965
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10966
0
    return 1;
10967
0
}
10968
10969
/*[clinic input]
10970
str.center as unicode_center
10971
10972
    width: Py_ssize_t
10973
    fillchar: Py_UCS4 = ' '
10974
    /
10975
10976
Return a centered string of length width.
10977
10978
Padding is done using the specified fill character (default is a space).
10979
[clinic start generated code]*/
10980
10981
static PyObject *
10982
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10983
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10984
0
{
10985
0
    Py_ssize_t marg, left;
10986
10987
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10988
0
        return unicode_result_unchanged(self);
10989
10990
0
    marg = width - PyUnicode_GET_LENGTH(self);
10991
0
    left = marg / 2 + (marg & width & 1);
10992
10993
0
    return pad(self, left, marg - left, fillchar);
10994
0
}
10995
10996
/* This function assumes that str1 and str2 are readied by the caller. */
10997
10998
static int
10999
unicode_compare(PyObject *str1, PyObject *str2)
11000
21.5M
{
11001
21.5M
#define COMPARE(TYPE1, TYPE2) \
11002
21.5M
    do { \
11003
20.5M
        TYPE1* p1 = (TYPE1 *)data1; \
11004
20.5M
        TYPE2* p2 = (TYPE2 *)data2; \
11005
20.5M
        TYPE1* end = p1 + len; \
11006
20.5M
        Py_UCS4 c1, c2; \
11007
20.5M
        for (; p1 != end; p1++, p2++) { \
11008
20.5M
            c1 = *p1; \
11009
20.5M
            c2 = *p2; \
11010
20.5M
            if (c1 != c2) \
11011
20.5M
                return (c1 < c2) ? -1 : 1; \
11012
20.5M
        } \
11013
20.5M
    } \
11014
20.5M
    while (0)
11015
11016
21.5M
    int kind1, kind2;
11017
21.5M
    const void *data1, *data2;
11018
21.5M
    Py_ssize_t len1, len2, len;
11019
11020
21.5M
    kind1 = PyUnicode_KIND(str1);
11021
21.5M
    kind2 = PyUnicode_KIND(str2);
11022
21.5M
    data1 = PyUnicode_DATA(str1);
11023
21.5M
    data2 = PyUnicode_DATA(str2);
11024
21.5M
    len1 = PyUnicode_GET_LENGTH(str1);
11025
21.5M
    len2 = PyUnicode_GET_LENGTH(str2);
11026
21.5M
    len = Py_MIN(len1, len2);
11027
11028
21.5M
    switch(kind1) {
11029
2.09M
    case PyUnicode_1BYTE_KIND:
11030
2.09M
    {
11031
2.09M
        switch(kind2) {
11032
75.4k
        case PyUnicode_1BYTE_KIND:
11033
75.4k
        {
11034
75.4k
            int cmp = memcmp(data1, data2, len);
11035
            /* normalize result of memcmp() into the range [-1; 1] */
11036
75.4k
            if (cmp < 0)
11037
49.3k
                return -1;
11038
26.0k
            if (cmp > 0)
11039
25.5k
                return 1;
11040
563
            break;
11041
26.0k
        }
11042
1.57M
        case PyUnicode_2BYTE_KIND:
11043
1.57M
            COMPARE(Py_UCS1, Py_UCS2);
11044
0
            break;
11045
443k
        case PyUnicode_4BYTE_KIND:
11046
443k
            COMPARE(Py_UCS1, Py_UCS4);
11047
0
            break;
11048
0
        default:
11049
0
            Py_UNREACHABLE();
11050
2.09M
        }
11051
563
        break;
11052
2.09M
    }
11053
17.9M
    case PyUnicode_2BYTE_KIND:
11054
17.9M
    {
11055
17.9M
        switch(kind2) {
11056
3.05k
        case PyUnicode_1BYTE_KIND:
11057
3.05k
            COMPARE(Py_UCS2, Py_UCS1);
11058
0
            break;
11059
15.8M
        case PyUnicode_2BYTE_KIND:
11060
15.8M
        {
11061
15.8M
            COMPARE(Py_UCS2, Py_UCS2);
11062
0
            break;
11063
15.8M
        }
11064
2.05M
        case PyUnicode_4BYTE_KIND:
11065
2.05M
            COMPARE(Py_UCS2, Py_UCS4);
11066
0
            break;
11067
0
        default:
11068
0
            Py_UNREACHABLE();
11069
17.9M
        }
11070
0
        break;
11071
17.9M
    }
11072
1.47M
    case PyUnicode_4BYTE_KIND:
11073
1.47M
    {
11074
1.47M
        switch(kind2) {
11075
4.12k
        case PyUnicode_1BYTE_KIND:
11076
4.12k
            COMPARE(Py_UCS4, Py_UCS1);
11077
0
            break;
11078
574k
        case PyUnicode_2BYTE_KIND:
11079
574k
            COMPARE(Py_UCS4, Py_UCS2);
11080
0
            break;
11081
898k
        case PyUnicode_4BYTE_KIND:
11082
898k
        {
11083
898k
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11084
898k
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11085
            /* normalize result of wmemcmp() into the range [-1; 1] */
11086
898k
            if (cmp < 0)
11087
436k
                return -1;
11088
462k
            if (cmp > 0)
11089
462k
                return 1;
11090
#else
11091
            COMPARE(Py_UCS4, Py_UCS4);
11092
#endif
11093
0
            break;
11094
462k
        }
11095
0
        default:
11096
0
            Py_UNREACHABLE();
11097
1.47M
        }
11098
0
        break;
11099
1.47M
    }
11100
0
    default:
11101
0
        Py_UNREACHABLE();
11102
21.5M
    }
11103
11104
563
    if (len1 == len2)
11105
560
        return 0;
11106
3
    if (len1 < len2)
11107
3
        return -1;
11108
0
    else
11109
0
        return 1;
11110
11111
3
#undef COMPARE
11112
3
}
11113
11114
11115
int
11116
_PyUnicode_Equal(PyObject *str1, PyObject *str2)
11117
290M
{
11118
290M
    assert(PyUnicode_Check(str1));
11119
290M
    assert(PyUnicode_Check(str2));
11120
290M
    if (str1 == str2) {
11121
76.9M
        return 1;
11122
76.9M
    }
11123
213M
    return unicode_eq(str1, str2);
11124
290M
}
11125
11126
11127
int
11128
PyUnicode_Equal(PyObject *str1, PyObject *str2)
11129
0
{
11130
0
    if (!PyUnicode_Check(str1)) {
11131
0
        PyErr_Format(PyExc_TypeError,
11132
0
                     "first argument must be str, not %T", str1);
11133
0
        return -1;
11134
0
    }
11135
0
    if (!PyUnicode_Check(str2)) {
11136
0
        PyErr_Format(PyExc_TypeError,
11137
0
                     "second argument must be str, not %T", str2);
11138
0
        return -1;
11139
0
    }
11140
11141
0
    return _PyUnicode_Equal(str1, str2);
11142
0
}
11143
11144
11145
int
11146
PyUnicode_Compare(PyObject *left, PyObject *right)
11147
7.20k
{
11148
7.20k
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11149
        /* a string is equal to itself */
11150
7.20k
        if (left == right)
11151
0
            return 0;
11152
11153
7.20k
        return unicode_compare(left, right);
11154
7.20k
    }
11155
0
    PyErr_Format(PyExc_TypeError,
11156
0
                 "Can't compare %.100s and %.100s",
11157
0
                 Py_TYPE(left)->tp_name,
11158
0
                 Py_TYPE(right)->tp_name);
11159
0
    return -1;
11160
7.20k
}
11161
11162
int
11163
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11164
2.00M
{
11165
2.00M
    Py_ssize_t i;
11166
2.00M
    int kind;
11167
2.00M
    Py_UCS4 chr;
11168
11169
2.00M
    assert(_PyUnicode_CHECK(uni));
11170
2.00M
    kind = PyUnicode_KIND(uni);
11171
2.00M
    if (kind == PyUnicode_1BYTE_KIND) {
11172
2.00M
        const void *data = PyUnicode_1BYTE_DATA(uni);
11173
2.00M
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11174
2.00M
        size_t len, len2 = strlen(str);
11175
2.00M
        int cmp;
11176
11177
2.00M
        len = Py_MIN(len1, len2);
11178
2.00M
        cmp = memcmp(data, str, len);
11179
2.00M
        if (cmp != 0) {
11180
1.44M
            if (cmp < 0)
11181
8.43k
                return -1;
11182
1.43M
            else
11183
1.43M
                return 1;
11184
1.44M
        }
11185
561k
        if (len1 > len2)
11186
70
            return 1; /* uni is longer */
11187
561k
        if (len1 < len2)
11188
734
            return -1; /* str is longer */
11189
560k
        return 0;
11190
561k
    }
11191
1.67k
    else {
11192
1.67k
        const void *data = PyUnicode_DATA(uni);
11193
        /* Compare Unicode string and source character set string */
11194
2.90k
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11195
2.66k
            if (chr != (unsigned char)str[i])
11196
1.44k
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11197
        /* This check keeps Python strings that end in '\0' from comparing equal
11198
         to C strings identical up to that point. */
11199
238
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11200
238
            return 1; /* uni is longer */
11201
0
        if (str[i])
11202
0
            return -1; /* str is longer */
11203
0
        return 0;
11204
0
    }
11205
2.00M
}
11206
11207
int
11208
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
11209
0
{
11210
0
    return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
11211
0
}
11212
11213
int
11214
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
11215
0
{
11216
0
    assert(_PyUnicode_CHECK(unicode));
11217
0
    assert(str);
11218
11219
0
    if (PyUnicode_IS_ASCII(unicode)) {
11220
0
        Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11221
0
        return size == len &&
11222
0
            memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11223
0
    }
11224
0
    if (PyUnicode_UTF8(unicode) != NULL) {
11225
0
        Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
11226
0
        return size == len &&
11227
0
            memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
11228
0
    }
11229
11230
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
11231
0
    if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
11232
0
        return 0;
11233
0
    }
11234
0
    const unsigned char *s = (const unsigned char *)str;
11235
0
    const unsigned char *ends = s + (size_t)size;
11236
0
    int kind = PyUnicode_KIND(unicode);
11237
0
    const void *data = PyUnicode_DATA(unicode);
11238
    /* Compare Unicode string and UTF-8 string */
11239
0
    for (Py_ssize_t i = 0; i < len; i++) {
11240
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11241
0
        if (ch < 0x80) {
11242
0
            if (ends == s || s[0] != ch) {
11243
0
                return 0;
11244
0
            }
11245
0
            s += 1;
11246
0
        }
11247
0
        else if (ch < 0x800) {
11248
0
            if ((ends - s) < 2 ||
11249
0
                s[0] != (0xc0 | (ch >> 6)) ||
11250
0
                s[1] != (0x80 | (ch & 0x3f)))
11251
0
            {
11252
0
                return 0;
11253
0
            }
11254
0
            s += 2;
11255
0
        }
11256
0
        else if (ch < 0x10000) {
11257
0
            if (Py_UNICODE_IS_SURROGATE(ch) ||
11258
0
                (ends - s) < 3 ||
11259
0
                s[0] != (0xe0 | (ch >> 12)) ||
11260
0
                s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
11261
0
                s[2] != (0x80 | (ch & 0x3f)))
11262
0
            {
11263
0
                return 0;
11264
0
            }
11265
0
            s += 3;
11266
0
        }
11267
0
        else {
11268
0
            assert(ch <= MAX_UNICODE);
11269
0
            if ((ends - s) < 4 ||
11270
0
                s[0] != (0xf0 | (ch >> 18)) ||
11271
0
                s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
11272
0
                s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
11273
0
                s[3] != (0x80 | (ch & 0x3f)))
11274
0
            {
11275
0
                return 0;
11276
0
            }
11277
0
            s += 4;
11278
0
        }
11279
0
    }
11280
0
    return s == ends;
11281
0
}
11282
11283
int
11284
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11285
7.72M
{
11286
7.72M
    size_t len;
11287
7.72M
    assert(_PyUnicode_CHECK(unicode));
11288
7.72M
    assert(str);
11289
#ifndef NDEBUG
11290
    for (const char *p = str; *p; p++) {
11291
        assert((unsigned char)*p < 128);
11292
    }
11293
#endif
11294
7.72M
    if (!PyUnicode_IS_ASCII(unicode))
11295
152k
        return 0;
11296
7.56M
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11297
7.56M
    return strlen(str) == len &&
11298
463k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11299
7.72M
}
11300
11301
int
11302
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11303
0
{
11304
0
    PyObject *right_uni;
11305
11306
0
    assert(_PyUnicode_CHECK(left));
11307
0
    assert(right->string);
11308
#ifndef NDEBUG
11309
    for (const char *p = right->string; *p; p++) {
11310
        assert((unsigned char)*p < 128);
11311
    }
11312
#endif
11313
11314
0
    if (!PyUnicode_IS_ASCII(left))
11315
0
        return 0;
11316
11317
0
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11318
0
    if (right_uni == NULL) {
11319
        /* memory error or bad data */
11320
0
        PyErr_Clear();
11321
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11322
0
    }
11323
11324
0
    if (left == right_uni)
11325
0
        return 1;
11326
11327
0
    assert(PyUnicode_CHECK_INTERNED(right_uni));
11328
0
    if (PyUnicode_CHECK_INTERNED(left)) {
11329
0
        return 0;
11330
0
    }
11331
11332
0
    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
11333
0
    assert(right_hash != -1);
11334
0
    Py_hash_t hash = PyUnicode_HASH(left);
11335
0
    if (hash != -1 && hash != right_hash) {
11336
0
        return 0;
11337
0
    }
11338
11339
0
    return unicode_eq(left, right_uni);
11340
0
}
11341
11342
/* Rich comparison of two objects as strings.
 *
 * Returns NotImplemented unless both operands are str.  Identical
 * operands are answered from `op` alone, Py_EQ/Py_NE go through
 * unicode_eq(), and the ordering operators go through unicode_compare().
 * An unknown `op` on identical operands raises via PyErr_BadArgument()
 * and returns NULL.
 */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    if (!(PyUnicode_Check(left) && PyUnicode_Check(right))) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (left == right) {
        /* a string is equal to itself */
        if (op == Py_EQ || op == Py_LE || op == Py_GE) {
            Py_RETURN_TRUE;
        }
        if (op == Py_NE || op == Py_LT || op == Py_GT) {
            Py_RETURN_FALSE;
        }
        PyErr_BadArgument();
        return NULL;
    }

    if (op != Py_EQ && op != Py_NE) {
        /* ordering: turn the three-way result into a bool for `op` */
        int order = unicode_compare(left, right);
        Py_RETURN_RICHCOMPARE(order, 0, op);
    }

    /* (in)equality: flip the equality result when op is Py_NE */
    int equal = unicode_eq(left, right);
    equal ^= (op == Py_NE);
    return PyBool_FromLong(equal);
}
11376
11377
int
11378
PyUnicode_Contains(PyObject *str, PyObject *substr)
11379
91.3M
{
11380
91.3M
    int kind1, kind2;
11381
91.3M
    const void *buf1, *buf2;
11382
91.3M
    Py_ssize_t len1, len2;
11383
91.3M
    int result;
11384
11385
91.3M
    if (!PyUnicode_Check(substr)) {
11386
0
        PyErr_Format(PyExc_TypeError,
11387
0
                     "'in <string>' requires string as left operand, not %.100s",
11388
0
                     Py_TYPE(substr)->tp_name);
11389
0
        return -1;
11390
0
    }
11391
91.3M
    if (ensure_unicode(str) < 0)
11392
0
        return -1;
11393
11394
91.3M
    kind1 = PyUnicode_KIND(str);
11395
91.3M
    kind2 = PyUnicode_KIND(substr);
11396
91.3M
    if (kind1 < kind2)
11397
3.57M
        return 0;
11398
87.8M
    len1 = PyUnicode_GET_LENGTH(str);
11399
87.8M
    len2 = PyUnicode_GET_LENGTH(substr);
11400
87.8M
    if (len1 < len2)
11401
5.65M
        return 0;
11402
82.1M
    buf1 = PyUnicode_DATA(str);
11403
82.1M
    buf2 = PyUnicode_DATA(substr);
11404
82.1M
    if (len2 == 1) {
11405
82.1M
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11406
82.1M
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11407
82.1M
        return result;
11408
82.1M
    }
11409
41.3k
    if (kind2 != kind1) {
11410
22.4k
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11411
22.4k
        if (!buf2)
11412
0
            return -1;
11413
22.4k
    }
11414
11415
41.3k
    switch (kind1) {
11416
18.8k
    case PyUnicode_1BYTE_KIND:
11417
18.8k
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11418
18.8k
        break;
11419
17.1k
    case PyUnicode_2BYTE_KIND:
11420
17.1k
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11421
17.1k
        break;
11422
5.34k
    case PyUnicode_4BYTE_KIND:
11423
5.34k
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11424
5.34k
        break;
11425
0
    default:
11426
0
        Py_UNREACHABLE();
11427
41.3k
    }
11428
11429
41.3k
    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
11430
41.3k
    if (kind2 != kind1)
11431
22.4k
        PyMem_Free((void *)buf2);
11432
11433
41.3k
    return result;
11434
41.3k
}
11435
11436
/* Concat to string or Unicode object giving a new Unicode object. */
11437
11438
PyObject *
11439
PyUnicode_Concat(PyObject *left, PyObject *right)
11440
38.8M
{
11441
38.8M
    PyObject *result;
11442
38.8M
    Py_UCS4 maxchar, maxchar2;
11443
38.8M
    Py_ssize_t left_len, right_len, new_len;
11444
11445
38.8M
    if (ensure_unicode(left) < 0)
11446
0
        return NULL;
11447
11448
38.8M
    if (!PyUnicode_Check(right)) {
11449
0
        PyErr_Format(PyExc_TypeError,
11450
0
            "can only concatenate str (not \"%.200s\") to str",
11451
0
            Py_TYPE(right)->tp_name);
11452
0
        return NULL;
11453
0
    }
11454
11455
    /* Shortcuts */
11456
38.8M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11457
38.8M
    if (left == empty) {
11458
72.2k
        return PyUnicode_FromObject(right);
11459
72.2k
    }
11460
38.8M
    if (right == empty) {
11461
4.58M
        return PyUnicode_FromObject(left);
11462
4.58M
    }
11463
11464
34.2M
    left_len = PyUnicode_GET_LENGTH(left);
11465
34.2M
    right_len = PyUnicode_GET_LENGTH(right);
11466
34.2M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11467
0
        PyErr_SetString(PyExc_OverflowError,
11468
0
                        "strings are too large to concat");
11469
0
        return NULL;
11470
0
    }
11471
34.2M
    new_len = left_len + right_len;
11472
11473
34.2M
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11474
34.2M
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11475
34.2M
    maxchar = Py_MAX(maxchar, maxchar2);
11476
11477
    /* Concat the two Unicode strings */
11478
34.2M
    result = PyUnicode_New(new_len, maxchar);
11479
34.2M
    if (result == NULL)
11480
0
        return NULL;
11481
34.2M
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11482
34.2M
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11483
34.2M
    assert(_PyUnicode_CheckConsistency(result, 1));
11484
34.2M
    return result;
11485
34.2M
}
11486
11487
void
11488
PyUnicode_Append(PyObject **p_left, PyObject *right)
11489
1.49M
{
11490
1.49M
    PyObject *left, *res;
11491
1.49M
    Py_UCS4 maxchar, maxchar2;
11492
1.49M
    Py_ssize_t left_len, right_len, new_len;
11493
11494
1.49M
    if (p_left == NULL) {
11495
0
        if (!PyErr_Occurred())
11496
0
            PyErr_BadInternalCall();
11497
0
        return;
11498
0
    }
11499
1.49M
    left = *p_left;
11500
1.49M
    if (right == NULL || left == NULL
11501
1.49M
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11502
0
        if (!PyErr_Occurred())
11503
0
            PyErr_BadInternalCall();
11504
0
        goto error;
11505
0
    }
11506
11507
    /* Shortcuts */
11508
1.49M
    PyObject *empty = unicode_get_empty();  // Borrowed reference
11509
1.49M
    if (left == empty) {
11510
455k
        Py_DECREF(left);
11511
455k
        *p_left = Py_NewRef(right);
11512
455k
        return;
11513
455k
    }
11514
1.03M
    if (right == empty) {
11515
0
        return;
11516
0
    }
11517
11518
1.03M
    left_len = PyUnicode_GET_LENGTH(left);
11519
1.03M
    right_len = PyUnicode_GET_LENGTH(right);
11520
1.03M
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11521
0
        PyErr_SetString(PyExc_OverflowError,
11522
0
                        "strings are too large to concat");
11523
0
        goto error;
11524
0
    }
11525
1.03M
    new_len = left_len + right_len;
11526
11527
1.03M
    if (unicode_modifiable(left)
11528
1.03M
        && PyUnicode_CheckExact(right)
11529
1.03M
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11530
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11531
           to change the structure size, but characters are stored just after
11532
           the structure, and so it requires to move all characters which is
11533
           not so different than duplicating the string. */
11534
992k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11535
992k
    {
11536
        /* append inplace */
11537
992k
        if (unicode_resize(p_left, new_len) != 0)
11538
0
            goto error;
11539
11540
        /* copy 'right' into the newly allocated area of 'left' */
11541
992k
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11542
992k
    }
11543
45.6k
    else {
11544
45.6k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11545
45.6k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11546
45.6k
        maxchar = Py_MAX(maxchar, maxchar2);
11547
11548
        /* Concat the two Unicode strings */
11549
45.6k
        res = PyUnicode_New(new_len, maxchar);
11550
45.6k
        if (res == NULL)
11551
0
            goto error;
11552
45.6k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11553
45.6k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11554
45.6k
        Py_DECREF(left);
11555
45.6k
        *p_left = res;
11556
45.6k
    }
11557
1.03M
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11558
1.03M
    return;
11559
11560
0
error:
11561
0
    Py_CLEAR(*p_left);
11562
0
}
11563
11564
void
11565
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11566
0
{
11567
0
    PyUnicode_Append(pleft, right);
11568
0
    Py_XDECREF(right);
11569
0
}
11570
11571
/*[clinic input]
11572
@permit_long_summary
11573
@text_signature "($self, sub[, start[, end]], /)"
11574
str.count as unicode_count -> Py_ssize_t
11575
11576
    self as str: self
11577
    sub as substr: unicode
11578
    start: slice_index(accept={int, NoneType}, c_default='0') = None
11579
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
11580
    /
11581
11582
Return the number of non-overlapping occurrences of substring sub in string S[start:end].
11583
11584
Optional arguments start and end are interpreted as in slice notation.
11585
[clinic start generated code]*/
11586
11587
static Py_ssize_t
11588
unicode_count_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11589
                   Py_ssize_t end)
11590
/*[clinic end generated code: output=8fcc3aef0b18edbf input=8590716ee228b935]*/
11591
20.1M
{
11592
20.1M
    assert(PyUnicode_Check(str));
11593
20.1M
    assert(PyUnicode_Check(substr));
11594
11595
20.1M
    Py_ssize_t result;
11596
20.1M
    int kind1, kind2;
11597
20.1M
    const void *buf1 = NULL, *buf2 = NULL;
11598
20.1M
    Py_ssize_t len1, len2;
11599
11600
20.1M
    kind1 = PyUnicode_KIND(str);
11601
20.1M
    kind2 = PyUnicode_KIND(substr);
11602
20.1M
    if (kind1 < kind2)
11603
0
        return 0;
11604
11605
20.1M
    len1 = PyUnicode_GET_LENGTH(str);
11606
20.1M
    len2 = PyUnicode_GET_LENGTH(substr);
11607
20.1M
    ADJUST_INDICES(start, end, len1);
11608
20.1M
    if (end - start < len2)
11609
87.5k
        return 0;
11610
11611
20.0M
    buf1 = PyUnicode_DATA(str);
11612
20.0M
    buf2 = PyUnicode_DATA(substr);
11613
20.0M
    if (kind2 != kind1) {
11614
3.97M
        buf2 = unicode_askind(kind2, buf2, len2, kind1);
11615
3.97M
        if (!buf2)
11616
0
            goto onError;
11617
3.97M
    }
11618
11619
    // We don't reuse `anylib_count` here because of the explicit casts.
11620
20.0M
    switch (kind1) {
11621
16.0M
    case PyUnicode_1BYTE_KIND:
11622
16.0M
        result = ucs1lib_count(
11623
16.0M
            ((const Py_UCS1*)buf1) + start, end - start,
11624
16.0M
            buf2, len2, PY_SSIZE_T_MAX
11625
16.0M
            );
11626
16.0M
        break;
11627
3.09M
    case PyUnicode_2BYTE_KIND:
11628
3.09M
        result = ucs2lib_count(
11629
3.09M
            ((const Py_UCS2*)buf1) + start, end - start,
11630
3.09M
            buf2, len2, PY_SSIZE_T_MAX
11631
3.09M
            );
11632
3.09M
        break;
11633
883k
    case PyUnicode_4BYTE_KIND:
11634
883k
        result = ucs4lib_count(
11635
883k
            ((const Py_UCS4*)buf1) + start, end - start,
11636
883k
            buf2, len2, PY_SSIZE_T_MAX
11637
883k
            );
11638
883k
        break;
11639
0
    default:
11640
0
        Py_UNREACHABLE();
11641
20.0M
    }
11642
11643
20.0M
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11644
20.0M
    if (kind2 != kind1)
11645
3.97M
        PyMem_Free((void *)buf2);
11646
11647
20.0M
    return result;
11648
0
  onError:
11649
0
    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
11650
0
    if (kind2 != kind1)
11651
0
        PyMem_Free((void *)buf2);
11652
0
    return -1;
11653
20.0M
}
11654
11655
/*[clinic input]
11656
str.encode as unicode_encode
11657
11658
    encoding: str(c_default="NULL") = 'utf-8'
11659
        The encoding in which to encode the string.
11660
    errors: str(c_default="NULL") = 'strict'
11661
        The error handling scheme to use for encoding errors.
11662
        The default is 'strict' meaning that encoding errors raise a
11663
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11664
        'xmlcharrefreplace' as well as any other name registered with
11665
        codecs.register_error that can handle UnicodeEncodeErrors.
11666
11667
Encode the string using the codec registered for encoding.
11668
[clinic start generated code]*/
11669
11670
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* str.encode(): all the work is delegated to
       PyUnicode_AsEncodedString(), whose result is returned as is. */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);

    return encoded;
}
11676
11677
/*[clinic input]
11678
str.expandtabs as unicode_expandtabs
11679
11680
    tabsize: int = 8
11681
11682
Return a copy where all tab characters are expanded using spaces.
11683
11684
If tabsize is not given, a tab size of 8 characters is assumed.
11685
[clinic start generated code]*/
11686
11687
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* str.expandtabs(): two passes over the string.  The first pass
       computes the output length, the second one fills the new string.
       Each tab is replaced by spaces up to the next multiple of tabsize
       in the current line; with tabsize <= 0 tabs are simply dropped.
       Raises OverflowError if the result length would exceed
       PY_SSIZE_T_MAX. */
    PyObject *expanded;
    void *dest_data;
    Py_ssize_t out_len = 0;     /* characters produced so far */
    Py_ssize_t column = 0;      /* position within the current line */
    int has_tab = 0;

    const Py_ssize_t src_len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *src_data = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, pos);

        if (ch != '\t') {
            if (out_len > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_len++;
            /* '\n' and '\r' start a new line */
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }

        has_tab = 1;
        if (tabsize <= 0) {
            continue;
        }
        const Py_ssize_t pad = tabsize - (column % tabsize); /* cannot overflow */
        if (out_len > PY_SSIZE_T_MAX - pad) {
            goto overflow;
        }
        column += pad;
        out_len += pad;
    }

    /* Without any tab the string is returned as is. */
    if (!has_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_len, PyUnicode_MAX_CHAR_VALUE(self));
    if (expanded == NULL) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out_len = 0;
    column = 0;
    for (Py_ssize_t pos = 0; pos < src_len; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, src_data, pos);

        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, out_len, ch);
            out_len++;
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }

        if (tabsize <= 0) {
            continue;
        }
        const Py_ssize_t pad = tabsize - (column % tabsize);
        column += pad;
        _PyUnicode_Fill(kind, dest_data, ' ', out_len, pad);
        out_len += pad;
    }
    assert (out_len == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11762
11763
/*[clinic input]
11764
@permit_long_summary
11765
str.find as unicode_find = str.count
11766
11767
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11768
11769
Optional arguments start and end are interpreted as in slice notation.
11770
Return -1 on failure.
11771
[clinic start generated code]*/
11772
11773
static Py_ssize_t
11774
unicode_find_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11775
                  Py_ssize_t end)
11776
/*[clinic end generated code: output=51dbe6255712e278 input=3a9d650fe4c24695]*/
11777
16.0M
{
11778
16.0M
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11779
16.0M
    if (result < 0) {
11780
218k
        return -1;
11781
218k
    }
11782
15.7M
    return result;
11783
16.0M
}
11784
11785
static PyObject *
11786
unicode_getitem(PyObject *self, Py_ssize_t index)
11787
53.9M
{
11788
53.9M
    const void *data;
11789
53.9M
    int kind;
11790
53.9M
    Py_UCS4 ch;
11791
11792
53.9M
    if (!PyUnicode_Check(self)) {
11793
0
        PyErr_BadArgument();
11794
0
        return NULL;
11795
0
    }
11796
53.9M
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11797
384
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11798
384
        return NULL;
11799
384
    }
11800
53.9M
    kind = PyUnicode_KIND(self);
11801
53.9M
    data = PyUnicode_DATA(self);
11802
53.9M
    ch = PyUnicode_READ(kind, data, index);
11803
53.9M
    return unicode_char(ch);
11804
53.9M
}
11805
11806
/* Believe it or not, this produces the same value for ASCII strings
11807
   as bytes_hash(). */
11808
static Py_hash_t
11809
unicode_hash(PyObject *self)
11810
45.7M
{
11811
45.7M
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11812
11813
#ifdef Py_DEBUG
11814
    assert(_Py_HashSecret_Initialized);
11815
#endif
11816
45.7M
    Py_hash_t hash = PyUnicode_HASH(self);
11817
45.7M
    if (hash != -1) {
11818
257k
        return hash;
11819
257k
    }
11820
45.4M
    x = Py_HashBuffer(PyUnicode_DATA(self),
11821
45.4M
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11822
11823
45.4M
    PyUnicode_SET_HASH(self, x);
11824
45.4M
    return x;
11825
45.7M
}
11826
11827
/*[clinic input]
11828
@permit_long_summary
11829
str.index as unicode_index = str.count
11830
11831
Return the lowest index in S where substring sub is found, such that sub is contained within S[start:end].
11832
11833
Optional arguments start and end are interpreted as in slice notation.
11834
Raises ValueError when the substring is not found.
11835
[clinic start generated code]*/
11836
11837
static Py_ssize_t
11838
unicode_index_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
11839
                   Py_ssize_t end)
11840
/*[clinic end generated code: output=77558288837cdf40 input=ae5e48f69ed75b06]*/
11841
574k
{
11842
574k
    Py_ssize_t result = any_find_slice(str, substr, start, end, 1);
11843
574k
    if (result == -1) {
11844
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11845
0
    }
11846
574k
    else if (result < 0) {
11847
0
        return -1;
11848
0
    }
11849
574k
    return result;
11850
574k
}
11851
11852
/*[clinic input]
11853
str.isascii as unicode_isascii
11854
11855
Return True if all characters in the string are ASCII, False otherwise.
11856
11857
ASCII characters have code points in the range U+0000-U+007F.
11858
Empty string is ASCII too.
11859
[clinic start generated code]*/
11860
11861
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* str.isascii(): the answer is the object's ASCII flag, so no scan
       of the characters is done here. */
    const long ascii_flag = PyUnicode_IS_ASCII(self);

    return PyBool_FromLong(ascii_flag);
}
11867
11868
/*[clinic input]
11869
@permit_long_docstring_body
11870
str.islower as unicode_islower
11871
11872
Return True if the string is a lowercase string, False otherwise.
11873
11874
A string is lowercase if all cased characters in the string are lowercase and
11875
there is at least one cased character in the string.
11876
[clinic start generated code]*/
11877
11878
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=c6fc0295241a1aaa]*/
{
    /* str.islower(): False as soon as an uppercase or titlecase
       character is met; otherwise True only if at least one lowercase
       character was seen. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    int seen_lower = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once a lowercase character was found, stop testing for one. */
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11911
11912
/*[clinic input]
11913
@permit_long_docstring_body
11914
str.isupper as unicode_isupper
11915
11916
Return True if the string is an uppercase string, False otherwise.
11917
11918
A string is uppercase if all cased characters in the string are uppercase and
11919
there is at least one cased character in the string.
11920
[clinic start generated code]*/
11921
11922
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=8d5cb33e67efde72]*/
{
    /* str.isupper(): False as soon as a lowercase or titlecase
       character is met; otherwise True only if at least one uppercase
       character was seen. */
    const Py_ssize_t length = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (length == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (length == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    int seen_upper = 0;
    for (Py_ssize_t pos = 0; pos < length; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        /* Once an uppercase character was found, stop testing for one. */
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11955
11956
/*[clinic input]
11957
str.istitle as unicode_istitle
11958
11959
Return True if the string is a title-cased string, False otherwise.
11960
11961
In a title-cased string, upper- and title-case characters may only
11962
follow uncased characters and lowercase characters only cased ones.
11963
[clinic start generated code]*/
11964
11965
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* str.istitle(): upper/titlecase characters may only follow uncased
       ones, lowercase characters may only follow cased ones, and at least
       one cased character must be present. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* One character: titlecase or uppercase both qualify. */
    if (n == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, buf, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    /* The empty string is never title-cased. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    int any_cased = 0;     /* has any cased character been seen so far? */
    int after_cased = 0;   /* was the previous character cased? */
    for (Py_ssize_t pos = 0; pos < n; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, buf, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* A capital directly after a cased character breaks the rule. */
            if (after_cased) {
                Py_RETURN_FALSE;
            }
            after_cased = 1;
            any_cased = 1;
            continue;
        }
        if (Py_UNICODE_ISLOWER(ch)) {
            /* A lowercase character must continue a cased run. */
            if (!after_cased) {
                Py_RETURN_FALSE;
            }
            after_cased = 1;
            any_cased = 1;
            continue;
        }
        /* Uncased character: the next cased one starts a new word. */
        after_cased = 0;
    }
    return PyBool_FromLong(any_cased);
}
12011
12012
/*[clinic input]
12013
@permit_long_docstring_body
12014
str.isspace as unicode_isspace
12015
12016
Return True if the string is a whitespace string, False otherwise.
12017
12018
A string is whitespace if all characters in the string are whitespace and there
12019
is at least one character in the string.
12020
[clinic start generated code]*/
12021
12022
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=44fe05e248c6e159]*/
{
    /* str.isspace(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISSPACE. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, buf, 0)));
    }

    /* The empty string is not whitespace. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        const Py_UCS4 ch = PyUnicode_READ(kind, buf, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12050
12051
/*[clinic input]
12052
@permit_long_docstring_body
12053
str.isalpha as unicode_isalpha
12054
12055
Return True if the string is an alphabetic string, False otherwise.
12056
12057
A string is alphabetic if all characters in the string are alphabetic and there
12058
is at least one character in the string.
12059
[clinic start generated code]*/
12060
12061
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=c233000624a56e0d]*/
{
    /* str.isalpha(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISALPHA. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, buf, 0)));
    }

    /* The empty string is not alphabetic. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, buf, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12088
12089
/*[clinic input]
12090
@permit_long_docstring_body
12091
str.isalnum as unicode_isalnum
12092
12093
Return True if the string is an alpha-numeric string, False otherwise.
12094
12095
A string is alpha-numeric if all characters in the string are alpha-numeric and
12096
there is at least one character in the string.
12097
[clinic start generated code]*/
12098
12099
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5d63ba9c9bafdb6b]*/
{
    /* str.isalnum(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISALNUM. */
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, buf, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    /* The empty string is not alphanumeric. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        const Py_UCS4 ch = PyUnicode_READ(kind, buf, pos);
        if (!Py_UNICODE_ISALNUM(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12128
12129
/*[clinic input]
12130
@permit_long_docstring_body
12131
str.isdecimal as unicode_isdecimal
12132
12133
Return True if the string is a decimal string, False otherwise.
12134
12135
A string is a decimal string if all characters in the string are decimal and
12136
there is at least one character in the string.
12137
[clinic start generated code]*/
12138
12139
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=8e84a58b414935a3]*/
{
    /* str.isdecimal(): True when the string is non-empty and every
       character satisfies Py_UNICODE_ISDECIMAL. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, buf, 0)));
    }

    /* The empty string is not a decimal string. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, buf, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12166
12167
/*[clinic input]
12168
@permit_long_docstring_body
12169
str.isdigit as unicode_isdigit
12170
12171
Return True if the string is a digit string, False otherwise.
12172
12173
A string is a digit string if all characters in the string are digits and there
12174
is at least one character in the string.
12175
[clinic start generated code]*/
12176
12177
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=99e284affb54d4a0]*/
{
    /* str.isdigit(): True when the string is non-empty and every character
       satisfies Py_UNICODE_ISDIGIT. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, buf, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* The empty string is not a digit string. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, buf, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12205
12206
/*[clinic input]
12207
@permit_long_docstring_body
12208
str.isnumeric as unicode_isnumeric
12209
12210
Return True if the string is a numeric string, False otherwise.
12211
12212
A string is numeric if all characters in the string are numeric and there is at
12213
least one character in the string.
12214
[clinic start generated code]*/
12215
12216
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=e9f5b6b8b29b0ee6]*/
{
    /* str.isnumeric(): True when the string is non-empty and every
       character satisfies Py_UNICODE_ISNUMERIC. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, buf, 0)));
    }

    /* The empty string is not numeric. */
    if (n == 0) {
        Py_RETURN_FALSE;
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, buf, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12243
12244
Py_ssize_t
12245
_PyUnicode_ScanIdentifier(PyObject *self)
12246
13.9k
{
12247
13.9k
    Py_ssize_t i;
12248
13.9k
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12249
13.9k
    if (len == 0) {
12250
        /* an empty string is not a valid identifier */
12251
0
        return 0;
12252
0
    }
12253
12254
13.9k
    int kind = PyUnicode_KIND(self);
12255
13.9k
    const void *data = PyUnicode_DATA(self);
12256
13.9k
    Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12257
    /* PEP 3131 says that the first character must be in
12258
       XID_Start and subsequent characters in XID_Continue,
12259
       and for the ASCII range, the 2.x rules apply (i.e
12260
       start with letters and underscore, continue with
12261
       letters, digits, underscore). However, given the current
12262
       definition of XID_Start and XID_Continue, it is sufficient
12263
       to check just for these, except that _ must be allowed
12264
       as starting an identifier.  */
12265
13.9k
    if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12266
465
        return 0;
12267
465
    }
12268
12269
60.5k
    for (i = 1; i < len; i++) {
12270
47.2k
        ch = PyUnicode_READ(kind, data, i);
12271
47.2k
        if (!_PyUnicode_IsXidContinue(ch)) {
12272
218
            return i;
12273
218
        }
12274
47.2k
    }
12275
13.2k
    return i;
12276
13.5k
}
12277
12278
int
12279
PyUnicode_IsIdentifier(PyObject *self)
12280
958
{
12281
958
    Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12282
958
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12283
    /* an empty string is not a valid identifier */
12284
958
    return len && i == len;
12285
958
}
12286
12287
/*[clinic input]
12288
@permit_long_docstring_body
12289
str.isidentifier as unicode_isidentifier
12290
12291
Return True if the string is a valid Python identifier, False otherwise.
12292
12293
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12294
such as "def" or "class".
12295
[clinic start generated code]*/
12296
12297
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=86315dd889d7bd04]*/
{
    /* str.isidentifier(): wrap the int result of PyUnicode_IsIdentifier()
       in a Python bool. */
    const int is_ident = PyUnicode_IsIdentifier(self);
    return PyBool_FromLong(is_ident);
}
12303
12304
/*[clinic input]
12305
@permit_long_summary
12306
str.isprintable as unicode_isprintable
12307
12308
Return True if all characters in the string are printable, False otherwise.
12309
12310
A character is printable if repr() may use it in its output.
12311
[clinic start generated code]*/
12312
12313
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=18345ba847084ec5]*/
{
    /* str.isprintable(): True when every character satisfies
       Py_UNICODE_ISPRINTABLE.  Unlike the other is*() predicates there is
       no empty-string special case, so "" yields True. */
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);

    /* Fast path for a one-character string. */
    if (n == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, buf, 0)));
    }

    Py_ssize_t pos = 0;
    while (pos < n) {
        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, buf, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12337
12338
/*[clinic input]
12339
@permit_long_docstring_body
12340
str.join as unicode_join
12341
12342
    iterable: object
12343
    /
12344
12345
Concatenate any number of strings.
12346
12347
The string whose method is called is inserted in between each given string.
12348
The result is returned as a new string.
12349
12350
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12351
[clinic start generated code]*/
12352
12353
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=bac724ed412ef3f8]*/
{
    /* str.join(): forward to PyUnicode_Join() with `self` as the first
       (separator) argument and hand back whatever it returns. */
    PyObject *joined = PyUnicode_Join(self, iterable);
    return joined;
}
12359
12360
static Py_ssize_t
12361
unicode_length(PyObject *self)
12362
41.7M
{
12363
41.7M
    return PyUnicode_GET_LENGTH(self);
12364
41.7M
}
12365
12366
/*[clinic input]
12367
str.ljust as unicode_ljust
12368
12369
    width: Py_ssize_t
12370
    fillchar: Py_UCS4 = ' '
12371
    /
12372
12373
Return a left-justified string of length width.
12374
12375
Padding is done using the specified fill character (default is a space).
12376
[clinic start generated code]*/
12377
12378
static PyObject *
12379
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12380
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12381
0
{
12382
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12383
0
        return unicode_result_unchanged(self);
12384
12385
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12386
0
}
12387
12388
/*[clinic input]
12389
str.lower as unicode_lower
12390
12391
Return a copy of the string converted to lowercase.
12392
[clinic start generated code]*/
12393
12394
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* str.lower(): ASCII-only strings go through ascii_upper_or_lower()
       (second argument 1 -- presumably selecting lowercase; confirm against
       the helper), everything else through the generic case_operation()
       with do_lower. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_lower);
    }
    return ascii_upper_or_lower(self, 1);
}
12402
12403
59.5M
/* Strip modes understood by do_strip(), do_argstrip() and
   _PyUnicode_XStrip(). */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above; used when building error
   messages.  Both the strings and the table itself are read-only. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Map a strip mode (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) to its method
   name.  The index is not range-checked. */
#define STRIPNAME(i) (stripfuncnames[i])
12411
12412
/* externally visible for str.strip(unicode) */
12413
PyObject *
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
{
    /* Strip from `self` every leading and/or trailing character (depending
       on `striptype`: LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP) that occurs in
       `sepobj`, and return the remaining slice via PyUnicode_Substring(). */
    const int kind = PyUnicode_KIND(self);
    const void *buf = PyUnicode_DATA(self);
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const Py_ssize_t nsep = PyUnicode_GET_LENGTH(sepobj);
    const BLOOM_MASK mask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                            PyUnicode_DATA(sepobj),
                                            nsep);

    /* `lo` ends up as the index of the first character to keep. */
    Py_ssize_t lo = 0;
    if (striptype != RIGHTSTRIP) {
        for (; lo < n; lo++) {
            const Py_UCS4 ch = PyUnicode_READ(kind, buf, lo);
            /* The mask is a cheap pre-filter; a hit is confirmed with a
               real search in sepobj. */
            if (!BLOOM(mask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, nsep, 1) < 0) {
                break;
            }
        }
    }

    /* `hi` ends up one past the last character to keep. */
    Py_ssize_t hi = n;
    if (striptype != LEFTSTRIP) {
        for (; hi > lo; hi--) {
            const Py_UCS4 ch = PyUnicode_READ(kind, buf, hi - 1);
            if (!BLOOM(mask, ch)) {
                break;
            }
            if (PyUnicode_FindChar(sepobj, ch, 0, nsep, 1) < 0) {
                break;
            }
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12459
12460
PyObject*
12461
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12462
268M
{
12463
268M
    const unsigned char *data;
12464
268M
    int kind;
12465
268M
    Py_ssize_t length;
12466
12467
268M
    length = PyUnicode_GET_LENGTH(self);
12468
268M
    end = Py_MIN(end, length);
12469
12470
268M
    if (start == 0 && end == length)
12471
52.1M
        return unicode_result_unchanged(self);
12472
12473
216M
    if (start < 0 || end < 0) {
12474
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12475
0
        return NULL;
12476
0
    }
12477
216M
    if (start >= length || end < start)
12478
177k
        _Py_RETURN_UNICODE_EMPTY();
12479
12480
216M
    length = end - start;
12481
216M
    if (PyUnicode_IS_ASCII(self)) {
12482
46.4M
        data = PyUnicode_1BYTE_DATA(self);
12483
46.4M
        return _PyUnicode_FromASCII((const char*)(data + start), length);
12484
46.4M
    }
12485
169M
    else {
12486
169M
        kind = PyUnicode_KIND(self);
12487
169M
        data = PyUnicode_1BYTE_DATA(self);
12488
169M
        return PyUnicode_FromKindAndData(kind,
12489
169M
                                         data + kind * start,
12490
169M
                                         length);
12491
169M
    }
12492
216M
}
12493
12494
/* Strip leading and/or trailing whitespace from `self`, depending on
   `striptype` (LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP), and return the
   remaining slice via PyUnicode_Substring().  ASCII strings are tested
   against the _Py_ascii_whitespace table, all others with
   Py_UNICODE_ISSPACE. */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    const Py_ssize_t n = PyUnicode_GET_LENGTH(self);
    const int trim_left = (striptype != RIGHTSTRIP);
    const int trim_right = (striptype != LEFTSTRIP);
    Py_ssize_t lo = 0;   /* index of the first character to keep */
    Py_ssize_t hi = n;   /* one past the last character to keep */

    if (PyUnicode_IS_ASCII(self)) {
        const Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (trim_left) {
            while (lo < n && _Py_ascii_whitespace[bytes[lo]]) {
                lo++;
            }
        }
        if (trim_right) {
            while (hi > lo && _Py_ascii_whitespace[bytes[hi - 1]]) {
                hi--;
            }
        }
    }
    else {
        const int kind = PyUnicode_KIND(self);
        const void *buf = PyUnicode_DATA(self);

        if (trim_left) {
            for (; lo < n; lo++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, buf, lo);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (trim_right) {
            for (; hi > lo; hi--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, buf, hi - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12555
12556
12557
/* Shared implementation of str.strip(), str.lstrip() and str.rstrip().
   `sep` equal to None strips whitespace; a str strips the characters it
   contains; anything else sets TypeError and returns NULL. */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12573
12574
12575
/*[clinic input]
12576
@permit_long_summary
12577
str.strip as unicode_strip
12578
12579
    chars: object = None
12580
    /
12581
12582
Return a copy of the string with leading and trailing whitespace removed.
12583
12584
If chars is given and not None, remove characters in chars instead.
12585
[clinic start generated code]*/
12586
12587
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=8bc6353450345fbd]*/
{
    /* str.strip(): strip both ends through the shared helper. */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);
    return stripped;
}
12593
12594
12595
/*[clinic input]
12596
str.lstrip as unicode_lstrip
12597
12598
    chars: object = None
12599
    /
12600
12601
Return a copy of the string with leading whitespace removed.
12602
12603
If chars is given and not None, remove characters in chars instead.
12604
[clinic start generated code]*/
12605
12606
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): strip the left end only through the shared helper. */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);
    return stripped;
}
12612
12613
12614
/*[clinic input]
12615
str.rstrip as unicode_rstrip
12616
12617
    chars: object = None
12618
    /
12619
12620
Return a copy of the string with trailing whitespace removed.
12621
12622
If chars is given and not None, remove characters in chars instead.
12623
[clinic start generated code]*/
12624
12625
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): strip the right end only through the shared helper. */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);
    return stripped;
}
12631
12632
12633
static PyObject*
12634
unicode_repeat(PyObject *str, Py_ssize_t len)
12635
399k
{
12636
399k
    PyObject *u;
12637
399k
    Py_ssize_t nchars, n;
12638
12639
399k
    if (len < 1)
12640
36.8k
        _Py_RETURN_UNICODE_EMPTY();
12641
12642
    /* no repeat, return original string */
12643
362k
    if (len == 1)
12644
124k
        return unicode_result_unchanged(str);
12645
12646
238k
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12647
0
        PyErr_SetString(PyExc_OverflowError,
12648
0
                        "repeated string is too long");
12649
0
        return NULL;
12650
0
    }
12651
238k
    nchars = len * PyUnicode_GET_LENGTH(str);
12652
12653
238k
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12654
238k
    if (!u)
12655
0
        return NULL;
12656
238k
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12657
12658
238k
    if (PyUnicode_GET_LENGTH(str) == 1) {
12659
235k
        int kind = PyUnicode_KIND(str);
12660
235k
        Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12661
235k
        if (kind == PyUnicode_1BYTE_KIND) {
12662
235k
            void *to = PyUnicode_DATA(u);
12663
235k
            memset(to, (unsigned char)fill_char, len);
12664
235k
        }
12665
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12666
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12667
0
            for (n = 0; n < len; ++n)
12668
0
                ucs2[n] = fill_char;
12669
0
        } else {
12670
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12671
0
            assert(kind == PyUnicode_4BYTE_KIND);
12672
0
            for (n = 0; n < len; ++n)
12673
0
                ucs4[n] = fill_char;
12674
0
        }
12675
235k
    }
12676
2.39k
    else {
12677
2.39k
        Py_ssize_t char_size = PyUnicode_KIND(str);
12678
2.39k
        char *to = (char *) PyUnicode_DATA(u);
12679
2.39k
        _PyBytes_Repeat(to, nchars * char_size, PyUnicode_DATA(str),
12680
2.39k
            PyUnicode_GET_LENGTH(str) * char_size);
12681
2.39k
    }
12682
12683
238k
    assert(_PyUnicode_CheckConsistency(u, 1));
12684
238k
    return u;
12685
238k
}
12686
12687
PyObject *
12688
PyUnicode_Replace(PyObject *str,
12689
                  PyObject *substr,
12690
                  PyObject *replstr,
12691
                  Py_ssize_t maxcount)
12692
2
{
12693
2
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12694
2
            ensure_unicode(replstr) < 0)
12695
0
        return NULL;
12696
2
    return replace(str, substr, replstr, maxcount);
12697
2
}
12698
12699
/*[clinic input]
12700
@permit_long_docstring_body
12701
str.replace as unicode_replace
12702
12703
    old: unicode
12704
    new: unicode
12705
    /
12706
    count: Py_ssize_t = -1
12707
        Maximum number of occurrences to replace.
12708
        -1 (the default value) means replace all occurrences.
12709
12710
Return a copy with all occurrences of substring old replaced by new.
12711
12712
If the optional argument count is given, only the first count occurrences are
12713
replaced.
12714
[clinic start generated code]*/
12715
12716
static PyObject *
12717
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12718
                     Py_ssize_t count)
12719
/*[clinic end generated code: output=b63f1a8b5eebf448 input=f27ca92ac46b65a1]*/
12720
78.7M
{
12721
78.7M
    return replace(self, old, new, count);
12722
78.7M
}
12723
12724
/*[clinic input]
12725
@permit_long_docstring_body
12726
str.removeprefix as unicode_removeprefix
12727
12728
    prefix: unicode
12729
    /
12730
12731
Return a str with the given prefix string removed if present.
12732
12733
If the string starts with the prefix string, return string[len(prefix):].
12734
Otherwise, return a copy of the original string.
12735
[clinic start generated code]*/
12736
12737
static PyObject *
unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
/*[clinic end generated code: output=f1e5945e9763bcb9 input=1989a856dbb813f1]*/
{
    /* str.removeprefix(): tailmatch() is called with direction -1 over the
       whole string; -1 is treated as failure, 0 as "no match" and any other
       value as "match". */
    const int rc = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
    if (rc == -1) {
        return NULL;
    }
    if (!rc) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything after the prefix. */
    return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
                               PyUnicode_GET_LENGTH(self));
}
12751
12752
/*[clinic input]
12753
str.removesuffix as unicode_removesuffix
12754
12755
    suffix: unicode
12756
    /
12757
12758
Return a str with the given suffix string removed if present.
12759
12760
If the string ends with the suffix string and that suffix is not empty,
12761
return string[:-len(suffix)]. Otherwise, return a copy of the original
12762
string.
12763
[clinic start generated code]*/
12764
12765
static PyObject *
unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
{
    /* str.removesuffix(): tailmatch() is called with direction +1 over the
       whole string; -1 is treated as failure, 0 as "no match" and any other
       value as "match". */
    const int rc = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
    if (rc == -1) {
        return NULL;
    }
    if (!rc) {
        return unicode_result_unchanged(self);
    }

    /* Keep everything before the suffix. */
    return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
                                        - PyUnicode_GET_LENGTH(suffix));
}
12779
12780
/* Build the repr() of a str object: the text wrapped in quotes, with
   backslash, tab, CR, LF, the chosen quote character and every
   non-printable character written as an escape sequence.

   Returns a new reference, or NULL with an exception set: OverflowError
   when the output length would exceed PY_SSIZE_T_MAX, or whatever
   PyUnicode_New() raised. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    Py_ssize_t isize = PyUnicode_GET_LENGTH(unicode);
    const void *idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    Py_ssize_t osize = 0;
    Py_UCS4 maxch = 127;
    Py_ssize_t squote = 0;
    Py_ssize_t dquote = 0;
    int ikind = PyUnicode_KIND(unicode);
    for (Py_ssize_t i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t incr = 1;
        switch (ch) {
        case '\'': squote++; break;
        case '"':  dquote++; break;
        case '\\': case '\t': case '\r': case '\n':
            incr = 2;
            break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f) {
                incr = 4; /* \xHH */
            }
            else if (ch < 0x7f) {
                /* printable ASCII is copied as-is */
            }
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                /* copied as-is, but may widen the output kind */
                maxch = (ch > maxch) ? ch : maxch;
            }
            else if (ch < 0x100) {
                incr = 4; /* \xHH */
            }
            else if (ch < 0x10000) {
                incr = 6; /* \uHHHH */
            }
            else {
                incr = 10; /* \UHHHHHHHH */
            }
        }
        if (osize > PY_SSIZE_T_MAX - incr) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += incr;
    }

    Py_UCS4 quote = '\'';
    int changed = (osize != isize);
    if (squote) {
        changed = 1;
        if (dquote) {
            /* Both squote and dquote present. Use squote,
               and escape them */
            /* Each single quote gains a backslash; guard the addition the
               same way as the per-character increments above. */
            if (osize > PY_SSIZE_T_MAX - squote) {
                PyErr_SetString(PyExc_OverflowError,
                                "string is too long to generate repr");
                return NULL;
            }
            osize += squote;
        }
        else {
            quote = '"';
        }
    }
    /* Room for the two enclosing quotes, again overflow-checked. */
    if (osize > PY_SSIZE_T_MAX - 2) {
        PyErr_SetString(PyExc_OverflowError,
                        "string is too long to generate repr");
        return NULL;
    }
    osize += 2;   /* quotes */

    PyObject *repr = PyUnicode_New(osize, maxch);
    if (repr == NULL) {
        return NULL;
    }
    int okind = PyUnicode_KIND(repr);
    void *odata = PyUnicode_DATA(repr);

    if (!changed) {
        /* Nothing needs escaping: quote + verbatim copy + quote. */
        PyUnicode_WRITE(okind, odata, 0, quote);

        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);

        PyUnicode_WRITE(okind, odata, osize-1, quote);
    }
    else {
        /* Dispatch on the output kind to the matching *_repr helper, which
           presumably writes the quoted and escaped text into odata. */
        switch (okind) {
        case PyUnicode_1BYTE_KIND:
            ucs1lib_repr(unicode, quote, odata);
            break;
        case PyUnicode_2BYTE_KIND:
            ucs2lib_repr(unicode, quote, odata);
            break;
        default:
            assert(okind == PyUnicode_4BYTE_KIND);
            ucs4lib_repr(unicode, quote, odata);
        }
    }

    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12870
12871
/*[clinic input]
12872
@permit_long_summary
12873
str.rfind as unicode_rfind = str.count
12874
12875
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12876
12877
Optional arguments start and end are interpreted as in slice notation.
12878
Return -1 on failure.
12879
[clinic start generated code]*/
12880
12881
static Py_ssize_t
12882
unicode_rfind_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12883
                   Py_ssize_t end)
12884
/*[clinic end generated code: output=880b29f01dd014c8 input=7f7e97d5cd3299a2]*/
12885
9.87k
{
12886
9.87k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12887
9.87k
    if (result < 0) {
12888
6.61k
        return -1;
12889
6.61k
    }
12890
3.26k
    return result;
12891
9.87k
}
12892
12893
/*[clinic input]
12894
@permit_long_summary
12895
str.rindex as unicode_rindex = str.count
12896
12897
Return the highest index in S where substring sub is found, such that sub is contained within S[start:end].
12898
12899
Optional arguments start and end are interpreted as in slice notation.
12900
Raises ValueError when the substring is not found.
12901
[clinic start generated code]*/
12902
12903
static Py_ssize_t
12904
unicode_rindex_impl(PyObject *str, PyObject *substr, Py_ssize_t start,
12905
                    Py_ssize_t end)
12906
/*[clinic end generated code: output=5f3aef124c867fe1 input=0363a324740b3e62]*/
12907
119k
{
12908
119k
    Py_ssize_t result = any_find_slice(str, substr, start, end, -1);
12909
119k
    if (result == -1) {
12910
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12911
0
    }
12912
119k
    else if (result < 0) {
12913
0
        return -1;
12914
0
    }
12915
119k
    return result;
12916
119k
}
12917
12918
/*[clinic input]
12919
str.rjust as unicode_rjust
12920
12921
    width: Py_ssize_t
12922
    fillchar: Py_UCS4 = ' '
12923
    /
12924
12925
Return a right-justified string of length width.
12926
12927
Padding is done using the specified fill character (default is a space).
12928
[clinic start generated code]*/
12929
12930
static PyObject *
12931
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12932
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12933
0
{
12934
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12935
0
        return unicode_result_unchanged(self);
12936
12937
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12938
0
}
12939
12940
PyObject *
12941
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12942
0
{
12943
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12944
0
        return NULL;
12945
12946
0
    return split(s, sep, maxsplit);
12947
0
}
12948
12949
/*[clinic input]
12950
@permit_long_summary
12951
str.split as unicode_split
12952
12953
    sep: object = None
12954
        The separator used to split the string.
12955
12956
        When set to None (the default value), will split on any whitespace
12957
        character (including \n \r \t \f and spaces) and will discard
12958
        empty strings from the result.
12959
    maxsplit: Py_ssize_t = -1
12960
        Maximum number of splits.
12961
        -1 (the default value) means no limit.
12962
12963
Return a list of the substrings in the string, using sep as the separator string.
12964
12965
Splitting starts at the front of the string and works to the end.
12966
12967
Note, str.split() is mainly useful for data that has been intentionally
12968
delimited.  With natural text that includes punctuation, consider using
12969
the regular expression module.
12970
12971
[clinic start generated code]*/
12972
12973
static PyObject *
12974
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12975
/*[clinic end generated code: output=3a65b1db356948dc input=2c1fd08a78e038b8]*/
12976
22.7M
{
12977
22.7M
    if (sep == Py_None)
12978
180k
        return split(self, NULL, maxsplit);
12979
22.6M
    if (PyUnicode_Check(sep))
12980
22.6M
        return split(self, sep, maxsplit);
12981
12982
0
    PyErr_Format(PyExc_TypeError,
12983
0
                 "must be str or None, not %.100s",
12984
0
                 Py_TYPE(sep)->tp_name);
12985
0
    return NULL;
12986
22.6M
}
12987
12988
PyObject *
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
{
    /* Both arguments must pass ensure_unicode(); otherwise NULL is
       returned. */
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    /* A separator of a wider kind, or longer than the string, is
       answered directly with (str_obj, "", ""). */
    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, str_obj, empty, empty);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator data is converted to the
       string's kind; that temporary buffer is released with PyMem_Free()
       before returning. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    /* Dispatch to the partition routine matching the string's kind. */
    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_partition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_partition(str_obj, str_buf, str_len,
                                       sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_partition(str_obj, str_buf, str_len,
                                   sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13038
13039
13040
PyObject *
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
{
    /* Both arguments must pass ensure_unicode(); otherwise NULL is
       returned. */
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0) {
        return NULL;
    }

    const int str_kind = PyUnicode_KIND(str_obj);
    const int sep_kind = PyUnicode_KIND(sep_obj);
    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str_obj);
    const Py_ssize_t sep_len = PyUnicode_GET_LENGTH(sep_obj);

    /* A separator of a wider kind, or longer than the string, is
       answered directly with ("", "", str_obj). */
    if (str_kind < sep_kind || str_len < sep_len) {
        PyObject *empty = unicode_get_empty();  // Borrowed reference
        return PyTuple_Pack(3, empty, empty, str_obj);
    }

    const void *str_buf = PyUnicode_DATA(str_obj);
    const void *sep_buf = PyUnicode_DATA(sep_obj);

    /* When the kinds differ, the separator data is converted to the
       string's kind; that temporary buffer is released with PyMem_Free()
       before returning. */
    const int converted = (sep_kind != str_kind);
    if (converted) {
        sep_buf = unicode_askind(sep_kind, sep_buf, sep_len, str_kind);
        if (sep_buf == NULL) {
            return NULL;
        }
    }

    /* Dispatch to the rpartition routine matching the string's kind. */
    PyObject *result;
    switch (str_kind) {
    case PyUnicode_1BYTE_KIND:
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj)) {
            result = asciilib_rpartition(str_obj, str_buf, str_len,
                                         sep_obj, sep_buf, sep_len);
        }
        else {
            result = ucs1lib_rpartition(str_obj, str_buf, str_len,
                                        sep_obj, sep_buf, sep_len);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        result = ucs2lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        result = ucs4lib_rpartition(str_obj, str_buf, str_len,
                                    sep_obj, sep_buf, sep_len);
        break;
    default:
        Py_UNREACHABLE();
    }

    assert((sep_kind == str_kind) == (sep_buf == PyUnicode_DATA(sep_obj)));
    if (converted) {
        PyMem_Free((void *)sep_buf);
    }

    return result;
}
13090
13091
/*[clinic input]
13092
@permit_long_docstring_body
13093
str.partition as unicode_partition
13094
13095
    sep: object
13096
    /
13097
13098
Partition the string into three parts using the given separator.
13099
13100
This will search for the separator in the string.  If the separator is found,
13101
returns a 3-tuple containing the part before the separator, the separator
13102
itself, and the part after it.
13103
13104
If the separator is not found, returns a 3-tuple containing the original string
13105
and two empty strings.
13106
[clinic start generated code]*/
13107
13108
static PyObject *
unicode_partition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=4d854b520d7b0e97]*/
{
    /* str.partition() simply forwards to the C-level
       PyUnicode_Partition() and returns whatever it produces. */
    PyObject *parts = PyUnicode_Partition(self, sep);
    return parts;
}
13114
13115
/*[clinic input]
13116
@permit_long_docstring_body
13117
str.rpartition as unicode_rpartition = str.partition
13118
13119
Partition the string into three parts using the given separator.
13120
13121
This will search for the separator in the string, starting at the end. If
13122
the separator is found, returns a 3-tuple containing the part before the
13123
separator, the separator itself, and the part after it.
13124
13125
If the separator is not found, returns a 3-tuple containing two empty strings
13126
and the original string.
13127
[clinic start generated code]*/
13128
13129
static PyObject *
unicode_rpartition(PyObject *self, PyObject *sep)
/*[clinic end generated code: output=1aa13cf1156572aa input=a6adabe91e75b486]*/
{
    /* str.rpartition() simply forwards to the C-level
       PyUnicode_RPartition() and returns whatever it produces. */
    PyObject *parts = PyUnicode_RPartition(self, sep);
    return parts;
}
13135
13136
PyObject *
13137
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13138
0
{
13139
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13140
0
        return NULL;
13141
13142
0
    return rsplit(s, sep, maxsplit);
13143
0
}
13144
13145
/*[clinic input]
13146
@permit_long_summary
13147
str.rsplit as unicode_rsplit = str.split
13148
13149
Return a list of the substrings in the string, using sep as the separator string.
13150
13151
Splitting starts at the end of the string and works to the front.
13152
[clinic start generated code]*/
13153
13154
static PyObject *
13155
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13156
/*[clinic end generated code: output=c2b815c63bcabffc input=0f762e30d267fa83]*/
13157
50
{
13158
50
    if (sep == Py_None)
13159
0
        return rsplit(self, NULL, maxsplit);
13160
50
    if (PyUnicode_Check(sep))
13161
50
        return rsplit(self, sep, maxsplit);
13162
13163
0
    PyErr_Format(PyExc_TypeError,
13164
0
                 "must be str or None, not %.100s",
13165
0
                 Py_TYPE(sep)->tp_name);
13166
0
    return NULL;
13167
50
}
13168
13169
/*[clinic input]
13170
@permit_long_docstring_body
13171
str.splitlines as unicode_splitlines
13172
13173
    keepends: bool = False
13174
13175
Return a list of the lines in the string, breaking at line boundaries.
13176
13177
Line breaks are not included in the resulting list unless keepends is given and
13178
true.
13179
[clinic start generated code]*/
13180
13181
static PyObject *
unicode_splitlines_impl(PyObject *self, int keepends)
/*[clinic end generated code: output=f664dcdad153ec40 input=39eeafbfef61c827]*/
{
    /* str.splitlines() forwards directly to PyUnicode_Splitlines(). */
    PyObject *lines = PyUnicode_Splitlines(self, keepends);
    return lines;
}
13187
13188
static
13189
PyObject *unicode_str(PyObject *self)
13190
3.30M
{
13191
3.30M
    return unicode_result_unchanged(self);
13192
3.30M
}
13193
13194
/*[clinic input]
13195
@permit_long_summary
13196
str.swapcase as unicode_swapcase
13197
13198
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13199
[clinic start generated code]*/
13200
13201
static PyObject *
unicode_swapcase_impl(PyObject *self)
/*[clinic end generated code: output=5d28966bf6d7b2af input=85bc39a9b4e8ee91]*/
{
    /* str.swapcase(): run the generic case_operation() driver with the
       do_swapcase callback. */
    PyObject *swapped = case_operation(self, do_swapcase);
    return swapped;
}
13207
13208
/*[clinic input]
13209
13210
@staticmethod
13211
str.maketrans as unicode_maketrans
13212
13213
  x: object
13214
13215
  y: unicode=NULL
13216
13217
  z: unicode=NULL
13218
13219
  /
13220
13221
Return a translation table usable for str.translate().
13222
13223
If there is only one argument, it must be a dictionary mapping Unicode
13224
ordinals (integers) or characters to Unicode ordinals, strings or None.
13225
Character keys will be then converted to ordinals.
13226
If there are two arguments, they must be strings of equal length, and
13227
in the resulting dictionary, each character in x will be mapped to the
13228
character at the same position in y. If there is a third argument, it
13229
must be a string, whose characters will be mapped to None in the result.
13230
[clinic start generated code]*/
13231
13232
static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
{
    /* Build and return a new dict.
     *
     * - y == NULL: x must be exactly a dict; its entries are copied, with
     *   length-1 string keys replaced by their code point as an int.
     * - y != NULL: x must be a str of the same length as y; each character
     *   of x maps to the character at the same index in y (both as ints).
     *   If z is also given, each of its characters maps to None.
     *
     * y and z are read as strings without a type check in this function.
     * On any error the partially built dict is released and NULL is
     * returned.
     */
    PyObject *table = PyDict_New();
    if (table == NULL) {
        return NULL;
    }

    if (y == NULL) {
        /* x must be a dict */
        if (!PyDict_CheckExact(x)) {
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
                            "to maketrans it must be a dict");
            goto fail;
        }

        /* copy entries into the new dict, converting string keys to int keys */
        PyObject *src_key, *src_value;
        Py_ssize_t cursor = 0;
        while (PyDict_Next(x, &cursor, &src_key, &src_value)) {
            if (PyUnicode_Check(src_key)) {
                /* convert string keys to integer keys */
                if (PyUnicode_GET_LENGTH(src_key) != 1) {
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
                                    "table must be of length 1");
                    goto fail;
                }
                int key_kind = PyUnicode_KIND(src_key);
                const void *key_data = PyUnicode_DATA(src_key);
                PyObject *ordinal =
                    PyLong_FromLong(PyUnicode_READ(key_kind, key_data, 0));
                if (ordinal == NULL) {
                    goto fail;
                }
                int status = PyDict_SetItem(table, ordinal, src_value);
                Py_DECREF(ordinal);
                if (status < 0) {
                    goto fail;
                }
            }
            else if (PyLong_Check(src_key)) {
                /* just keep integer keys */
                if (PyDict_SetItem(table, src_key, src_value) < 0) {
                    goto fail;
                }
            }
            else {
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
                                "be strings or integers");
                goto fail;
            }
        }
    }
    else {
        /* x must be a string too, of equal length */
        if (!PyUnicode_Check(x)) {
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
                            "be a string if there is a second argument");
            goto fail;
        }
        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
                            "arguments must have equal length");
            goto fail;
        }

        /* create entries for translating chars in x to those in y */
        {
            int x_kind = PyUnicode_KIND(x);
            int y_kind = PyUnicode_KIND(y);
            const void *x_data = PyUnicode_DATA(x);
            const void *y_data = PyUnicode_DATA(y);
            Py_ssize_t x_len = PyUnicode_GET_LENGTH(x);

            for (Py_ssize_t idx = 0; idx < x_len; idx++) {
                PyObject *from =
                    PyLong_FromLong(PyUnicode_READ(x_kind, x_data, idx));
                if (from == NULL) {
                    goto fail;
                }
                PyObject *to =
                    PyLong_FromLong(PyUnicode_READ(y_kind, y_data, idx));
                if (to == NULL) {
                    Py_DECREF(from);
                    goto fail;
                }
                int status = PyDict_SetItem(table, from, to);
                Py_DECREF(from);
                Py_DECREF(to);
                if (status < 0) {
                    goto fail;
                }
            }
        }

        /* create entries for deleting chars in z */
        if (z != NULL) {
            int z_kind = PyUnicode_KIND(z);
            const void *z_data = PyUnicode_DATA(z);
            Py_ssize_t z_len = PyUnicode_GET_LENGTH(z);

            for (Py_ssize_t idx = 0; idx < z_len; idx++) {
                PyObject *removed =
                    PyLong_FromLong(PyUnicode_READ(z_kind, z_data, idx));
                if (removed == NULL) {
                    goto fail;
                }
                int status = PyDict_SetItem(table, removed, Py_None);
                Py_DECREF(removed);
                if (status < 0) {
                    goto fail;
                }
            }
        }
    }
    return table;

  fail:
    Py_DECREF(table);
    return NULL;
}
13337
13338
/*[clinic input]
13339
@permit_long_docstring_body
13340
str.translate as unicode_translate
13341
13342
    table: object
13343
        Translation table, which must be a mapping of Unicode ordinals to
13344
        Unicode ordinals, strings, or None.
13345
    /
13346
13347
Replace each character in the string using the given translation table.
13348
13349
The table must implement lookup/indexing via __getitem__, for instance a
13350
dictionary or list.  If this operation raises LookupError, the character is
13351
left untouched.  Characters mapped to None are deleted.
13352
[clinic start generated code]*/
13353
13354
static PyObject *
unicode_translate(PyObject *self, PyObject *table)
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=699e5fa0ebf9f5e9]*/
{
    /* str.translate(): forward to _PyUnicode_TranslateCharmap() with the
       error-handler name fixed to "ignore". */
    PyObject *translated = _PyUnicode_TranslateCharmap(self, table, "ignore");
    return translated;
}
13360
13361
/*[clinic input]
13362
str.upper as unicode_upper
13363
13364
Return a copy of the string converted to uppercase.
13365
[clinic start generated code]*/
13366
13367
static PyObject *
unicode_upper_impl(PyObject *self)
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
{
    /* str.upper(): non-ASCII strings go through the generic
       case_operation() driver with do_upper. */
    if (!PyUnicode_IS_ASCII(self)) {
        return case_operation(self, do_upper);
    }

    /* ASCII strings use ascii_upper_or_lower() with 0 as its second
       argument. */
    return ascii_upper_or_lower(self, 0);
}
13375
13376
/*[clinic input]
13377
@permit_long_summary
13378
str.zfill as unicode_zfill
13379
13380
    width: Py_ssize_t
13381
    /
13382
13383
Pad a numeric string with zeros on the left, to fill a field of the given width.
13384
13385
The string is never truncated.
13386
[clinic start generated code]*/
13387
13388
static PyObject *
13389
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13390
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=25a4ee0ea3e58ce0]*/
13391
0
{
13392
0
    Py_ssize_t fill;
13393
0
    PyObject *u;
13394
0
    int kind;
13395
0
    const void *data;
13396
0
    Py_UCS4 chr;
13397
13398
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13399
0
        return unicode_result_unchanged(self);
13400
13401
0
    fill = width - PyUnicode_GET_LENGTH(self);
13402
13403
0
    u = pad(self, fill, 0, '0');
13404
13405
0
    if (u == NULL)
13406
0
        return NULL;
13407
13408
0
    kind = PyUnicode_KIND(u);
13409
0
    data = PyUnicode_DATA(u);
13410
0
    chr = PyUnicode_READ(kind, data, fill);
13411
13412
0
    if (chr == '+' || chr == '-') {
13413
        /* move sign to beginning of string */
13414
0
        PyUnicode_WRITE(kind, data, 0, chr);
13415
0
        PyUnicode_WRITE(kind, data, fill, '0');
13416
0
    }
13417
13418
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13419
0
    return u;
13420
0
}
13421
13422
/*[clinic input]
13423
@permit_long_summary
13424
@text_signature "($self, prefix[, start[, end]], /)"
13425
str.startswith as unicode_startswith
13426
13427
    prefix as subobj: object
13428
        A string or a tuple of strings to try.
13429
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13430
        Optional start position. Default: start of the string.
13431
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13432
        Optional stop position. Default: end of the string.
13433
    /
13434
13435
Return True if the string starts with the specified prefix, False otherwise.
13436
[clinic start generated code]*/
13437
13438
static PyObject *
13439
unicode_startswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13440
                        Py_ssize_t end)
13441
/*[clinic end generated code: output=4bd7cfd0803051d4 input=766bdbd33df251dc]*/
13442
69.1M
{
13443
69.1M
    if (PyTuple_Check(subobj)) {
13444
8.69M
        Py_ssize_t i;
13445
31.5M
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13446
22.8M
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13447
22.8M
            if (!PyUnicode_Check(substring)) {
13448
0
                PyErr_Format(PyExc_TypeError,
13449
0
                             "tuple for startswith must only contain str, "
13450
0
                             "not %.100s",
13451
0
                             Py_TYPE(substring)->tp_name);
13452
0
                return NULL;
13453
0
            }
13454
22.8M
            int result = tailmatch(self, substring, start, end, -1);
13455
22.8M
            if (result < 0) {
13456
0
                return NULL;
13457
0
            }
13458
22.8M
            if (result) {
13459
37.4k
                Py_RETURN_TRUE;
13460
37.4k
            }
13461
22.8M
        }
13462
        /* nothing matched */
13463
8.69M
        Py_RETURN_FALSE;
13464
8.69M
    }
13465
60.4M
    if (!PyUnicode_Check(subobj)) {
13466
0
        PyErr_Format(PyExc_TypeError,
13467
0
                     "startswith first arg must be str or "
13468
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13469
0
        return NULL;
13470
0
    }
13471
60.4M
    int result = tailmatch(self, subobj, start, end, -1);
13472
60.4M
    if (result < 0) {
13473
0
        return NULL;
13474
0
    }
13475
60.4M
    return PyBool_FromLong(result);
13476
60.4M
}
13477
13478
13479
/*[clinic input]
13480
@permit_long_summary
13481
@text_signature "($self, suffix[, start[, end]], /)"
13482
str.endswith as unicode_endswith
13483
13484
    suffix as subobj: object
13485
        A string or a tuple of strings to try.
13486
    start: slice_index(accept={int, NoneType}, c_default='0') = None
13487
        Optional start position. Default: start of the string.
13488
    end: slice_index(accept={int, NoneType}, c_default='PY_SSIZE_T_MAX') = None
13489
        Optional stop position. Default: end of the string.
13490
    /
13491
13492
Return True if the string ends with the specified suffix, False otherwise.
13493
[clinic start generated code]*/
13494
13495
static PyObject *
13496
unicode_endswith_impl(PyObject *self, PyObject *subobj, Py_ssize_t start,
13497
                      Py_ssize_t end)
13498
/*[clinic end generated code: output=cce6f8ceb0102ca9 input=b66bf6d5547ba1aa]*/
13499
15.0M
{
13500
15.0M
    if (PyTuple_Check(subobj)) {
13501
182k
        Py_ssize_t i;
13502
333k
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13503
312k
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
13504
312k
            if (!PyUnicode_Check(substring)) {
13505
0
                PyErr_Format(PyExc_TypeError,
13506
0
                             "tuple for endswith must only contain str, "
13507
0
                             "not %.100s",
13508
0
                             Py_TYPE(substring)->tp_name);
13509
0
                return NULL;
13510
0
            }
13511
312k
            int result = tailmatch(self, substring, start, end, +1);
13512
312k
            if (result < 0) {
13513
0
                return NULL;
13514
0
            }
13515
312k
            if (result) {
13516
161k
                Py_RETURN_TRUE;
13517
161k
            }
13518
312k
        }
13519
182k
        Py_RETURN_FALSE;
13520
182k
    }
13521
14.8M
    if (!PyUnicode_Check(subobj)) {
13522
0
        PyErr_Format(PyExc_TypeError,
13523
0
                     "endswith first arg must be str or "
13524
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13525
0
        return NULL;
13526
0
    }
13527
14.8M
    int result = tailmatch(self, subobj, start, end, +1);
13528
14.8M
    if (result < 0) {
13529
0
        return NULL;
13530
0
    }
13531
14.8M
    return PyBool_FromLong(result);
13532
14.8M
}
13533
13534
13535
static inline void
13536
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13537
65.0M
{
13538
65.0M
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13539
65.0M
    writer->data = PyUnicode_DATA(writer->buffer);
13540
13541
65.0M
    if (!writer->readonly) {
13542
64.9M
        writer->kind = PyUnicode_KIND(writer->buffer);
13543
64.9M
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13544
64.9M
    }
13545
19.6k
    else {
13546
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13547
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13548
19.6k
        writer->kind = 0;
13549
19.6k
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13550
13551
        /* Copy-on-write mode: set buffer size to 0 so
13552
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13553
         * next write. */
13554
19.6k
        writer->size = 0;
13555
19.6k
    }
13556
65.0M
}
13557
13558
13559
void
13560
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13561
51.2M
{
13562
51.2M
    memset(writer, 0, sizeof(*writer));
13563
13564
    /* ASCII is the bare minimum */
13565
51.2M
    writer->min_char = 127;
13566
13567
    /* use a kind value smaller than PyUnicode_1BYTE_KIND so
13568
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13569
51.2M
    assert(writer->kind == 0);
13570
51.2M
    assert(writer->kind < PyUnicode_1BYTE_KIND);
13571
51.2M
}
13572
13573
13574
PyUnicodeWriter*
13575
PyUnicodeWriter_Create(Py_ssize_t length)
13576
4.72M
{
13577
4.72M
    if (length < 0) {
13578
0
        PyErr_SetString(PyExc_ValueError,
13579
0
                        "length must be positive");
13580
0
        return NULL;
13581
0
    }
13582
13583
4.72M
    const size_t size = sizeof(_PyUnicodeWriter);
13584
4.72M
    PyUnicodeWriter *pub_writer;
13585
4.72M
    pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
13586
4.72M
    if (pub_writer == NULL) {
13587
2.64M
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
13588
2.64M
        if (pub_writer == NULL) {
13589
0
            return (PyUnicodeWriter *)PyErr_NoMemory();
13590
0
        }
13591
2.64M
    }
13592
4.72M
    _PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
13593
13594
4.72M
    _PyUnicodeWriter_Init(writer);
13595
4.72M
    if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
13596
0
        PyUnicodeWriter_Discard(pub_writer);
13597
0
        return NULL;
13598
0
    }
13599
4.72M
    writer->overallocate = 1;
13600
13601
4.72M
    return pub_writer;
13602
4.72M
}
13603
13604
13605
void
PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    /* Release a writer; NULL is accepted and ignored. */
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        /* Hand the struct to the unicode_writers free list, with
           PyMem_Free as the supplied release function. */
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}
13613
13614
13615
// Initialize _PyUnicodeWriter with initial buffer
13616
static inline void
13617
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13618
634k
{
13619
634k
    memset(writer, 0, sizeof(*writer));
13620
634k
    writer->buffer = buffer;
13621
634k
    _PyUnicodeWriter_Update(writer);
13622
634k
    writer->min_length = writer->size;
13623
634k
}
13624
13625
13626
int
13627
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13628
                                 Py_ssize_t length, Py_UCS4 maxchar)
13629
64.3M
{
13630
64.3M
    Py_ssize_t newlen;
13631
64.3M
    PyObject *newbuffer;
13632
13633
64.3M
    assert(length >= 0);
13634
64.3M
    assert(maxchar <= MAX_UNICODE);
13635
13636
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13637
64.3M
    assert((maxchar > writer->maxchar && length >= 0)
13638
64.3M
           || length > 0);
13639
13640
64.3M
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13641
0
        PyErr_NoMemory();
13642
0
        return -1;
13643
0
    }
13644
64.3M
    newlen = writer->pos + length;
13645
13646
64.3M
    maxchar = Py_MAX(maxchar, writer->min_char);
13647
13648
64.3M
    if (writer->buffer == NULL) {
13649
47.0M
        assert(!writer->readonly);
13650
47.0M
        if (writer->overallocate
13651
35.9M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13652
            /* overallocate to limit the number of realloc() */
13653
35.9M
            newlen += newlen / OVERALLOCATE_FACTOR;
13654
35.9M
        }
13655
47.0M
        if (newlen < writer->min_length)
13656
42.2M
            newlen = writer->min_length;
13657
13658
47.0M
        writer->buffer = PyUnicode_New(newlen, maxchar);
13659
47.0M
        if (writer->buffer == NULL)
13660
0
            return -1;
13661
47.0M
    }
13662
17.3M
    else if (newlen > writer->size) {
13663
14.6M
        if (writer->overallocate
13664
14.2M
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13665
            /* overallocate to limit the number of realloc() */
13666
14.2M
            newlen += newlen / OVERALLOCATE_FACTOR;
13667
14.2M
        }
13668
14.6M
        if (newlen < writer->min_length)
13669
1.30k
            newlen = writer->min_length;
13670
13671
14.6M
        if (maxchar > writer->maxchar || writer->readonly) {
13672
            /* resize + widen */
13673
3.59M
            maxchar = Py_MAX(maxchar, writer->maxchar);
13674
3.59M
            newbuffer = PyUnicode_New(newlen, maxchar);
13675
3.59M
            if (newbuffer == NULL)
13676
0
                return -1;
13677
3.59M
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13678
3.59M
                                          writer->buffer, 0, writer->pos);
13679
3.59M
            Py_DECREF(writer->buffer);
13680
3.59M
            writer->readonly = 0;
13681
3.59M
        }
13682
11.0M
        else {
13683
11.0M
            newbuffer = resize_compact(writer->buffer, newlen);
13684
11.0M
            if (newbuffer == NULL)
13685
0
                return -1;
13686
11.0M
        }
13687
14.6M
        writer->buffer = newbuffer;
13688
14.6M
    }
13689
2.67M
    else if (maxchar > writer->maxchar) {
13690
2.67M
        assert(!writer->readonly);
13691
2.67M
        newbuffer = PyUnicode_New(writer->size, maxchar);
13692
2.67M
        if (newbuffer == NULL)
13693
0
            return -1;
13694
2.67M
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13695
2.67M
                                      writer->buffer, 0, writer->pos);
13696
2.67M
        Py_SETREF(writer->buffer, newbuffer);
13697
2.67M
    }
13698
64.3M
    _PyUnicodeWriter_Update(writer);
13699
64.3M
    return 0;
13700
13701
64.3M
#undef OVERALLOCATE_FACTOR
13702
64.3M
}
13703
13704
int
13705
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13706
                                     int kind)
13707
139k
{
13708
139k
    Py_UCS4 maxchar;
13709
13710
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13711
139k
    assert(writer->kind < kind);
13712
13713
139k
    switch (kind)
13714
139k
    {
13715
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13716
139k
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13717
0
    case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
13718
0
    default:
13719
0
        Py_UNREACHABLE();
13720
139k
    }
13721
13722
139k
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13723
139k
}
13724
13725
static inline int
13726
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13727
281M
{
13728
281M
    assert(ch <= MAX_UNICODE);
13729
281M
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13730
0
        return -1;
13731
281M
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13732
281M
    writer->pos++;
13733
281M
    return 0;
13734
281M
}
13735
13736
int
13737
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13738
103M
{
13739
103M
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13740
103M
}
13741
13742
int
13743
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
13744
71.8M
{
13745
71.8M
    if (ch > MAX_UNICODE) {
13746
0
        PyErr_SetString(PyExc_ValueError,
13747
0
                        "character must be in range(0x110000)");
13748
0
        return -1;
13749
0
    }
13750
13751
71.8M
    return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
13752
71.8M
}
13753
13754
int
13755
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13756
63.3M
{
13757
63.3M
    assert(PyUnicode_Check(str));
13758
13759
63.3M
    Py_UCS4 maxchar;
13760
63.3M
    Py_ssize_t len;
13761
13762
63.3M
    len = PyUnicode_GET_LENGTH(str);
13763
63.3M
    if (len == 0)
13764
22.3M
        return 0;
13765
41.0M
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13766
41.0M
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13767
23.0M
        if (writer->buffer == NULL && !writer->overallocate) {
13768
11.2k
            assert(_PyUnicode_CheckConsistency(str, 1));
13769
11.2k
            writer->readonly = 1;
13770
11.2k
            writer->buffer = Py_NewRef(str);
13771
11.2k
            _PyUnicodeWriter_Update(writer);
13772
11.2k
            writer->pos += len;
13773
11.2k
            return 0;
13774
11.2k
        }
13775
23.0M
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13776
0
            return -1;
13777
23.0M
    }
13778
41.0M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13779
41.0M
                                  str, 0, len);
13780
41.0M
    writer->pos += len;
13781
41.0M
    return 0;
13782
41.0M
}
13783
13784
int
13785
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
13786
4.06M
{
13787
4.06M
    PyTypeObject *type = Py_TYPE(obj);
13788
4.06M
    if (type == &PyUnicode_Type) {
13789
4.06M
        return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
13790
4.06M
    }
13791
13792
0
    if (type == &PyLong_Type) {
13793
0
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13794
0
    }
13795
13796
0
    PyObject *str = PyObject_Str(obj);
13797
0
    if (str == NULL) {
13798
0
        return -1;
13799
0
    }
13800
13801
0
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
13802
0
    Py_DECREF(str);
13803
0
    return res;
13804
0
}
13805
13806
13807
int
13808
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
13809
8.04M
{
13810
8.04M
    if (Py_TYPE(obj) == &PyLong_Type) {
13811
547k
        return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
13812
547k
    }
13813
13814
7.50M
    PyObject *repr = PyObject_Repr(obj);
13815
7.50M
    if (repr == NULL) {
13816
0
        return -1;
13817
0
    }
13818
13819
7.50M
    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
13820
7.50M
    Py_DECREF(repr);
13821
7.50M
    return res;
13822
7.50M
}
13823
13824
13825
int
13826
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13827
                                Py_ssize_t start, Py_ssize_t end)
13828
64.8M
{
13829
64.8M
    assert(0 <= start);
13830
64.8M
    assert(end <= PyUnicode_GET_LENGTH(str));
13831
64.8M
    assert(start <= end);
13832
13833
64.8M
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13834
116
        return _PyUnicodeWriter_WriteStr(writer, str);
13835
13836
64.8M
    Py_ssize_t len = end - start;
13837
64.8M
    if (len == 0) {
13838
0
        return 0;
13839
0
    }
13840
13841
64.8M
    Py_UCS4 maxchar;
13842
64.8M
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
13843
13.5M
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13844
13.5M
    }
13845
51.3M
    else {
13846
51.3M
        maxchar = writer->maxchar;
13847
51.3M
    }
13848
64.8M
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
13849
0
        return -1;
13850
0
    }
13851
13852
64.8M
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13853
64.8M
                                  str, start, len);
13854
64.8M
    writer->pos += len;
13855
64.8M
    return 0;
13856
64.8M
}
13857
13858
13859
int
13860
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
13861
                               Py_ssize_t start, Py_ssize_t end)
13862
609k
{
13863
609k
    if (!PyUnicode_Check(str)) {
13864
0
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
13865
0
        return -1;
13866
0
    }
13867
609k
    if (start < 0 || start > end) {
13868
0
        PyErr_Format(PyExc_ValueError, "invalid start argument");
13869
0
        return -1;
13870
0
    }
13871
609k
    if (end > PyUnicode_GET_LENGTH(str)) {
13872
0
        PyErr_Format(PyExc_ValueError, "invalid end argument");
13873
0
        return -1;
13874
0
    }
13875
13876
609k
    return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
13877
609k
                                           start, end);
13878
609k
}
13879
13880
13881
int
13882
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13883
                                  const char *ascii, Py_ssize_t len)
13884
52.4M
{
13885
52.4M
    if (len == -1)
13886
0
        len = strlen(ascii);
13887
13888
52.4M
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13889
13890
52.4M
    if (writer->buffer == NULL && !writer->overallocate) {
13891
8.45k
        PyObject *str;
13892
13893
8.45k
        str = _PyUnicode_FromASCII(ascii, len);
13894
8.45k
        if (str == NULL)
13895
0
            return -1;
13896
13897
8.45k
        writer->readonly = 1;
13898
8.45k
        writer->buffer = str;
13899
8.45k
        _PyUnicodeWriter_Update(writer);
13900
8.45k
        writer->pos += len;
13901
8.45k
        return 0;
13902
8.45k
    }
13903
13904
52.3M
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13905
0
        return -1;
13906
13907
52.3M
    switch (writer->kind)
13908
52.3M
    {
13909
52.3M
    case PyUnicode_1BYTE_KIND:
13910
52.3M
    {
13911
52.3M
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13912
52.3M
        Py_UCS1 *data = writer->data;
13913
13914
52.3M
        memcpy(data + writer->pos, str, len);
13915
52.3M
        break;
13916
0
    }
13917
12.1k
    case PyUnicode_2BYTE_KIND:
13918
12.1k
    {
13919
12.1k
        _PyUnicode_CONVERT_BYTES(
13920
12.1k
            Py_UCS1, Py_UCS2,
13921
12.1k
            ascii, ascii + len,
13922
12.1k
            (Py_UCS2 *)writer->data + writer->pos);
13923
12.1k
        break;
13924
0
    }
13925
3.41k
    case PyUnicode_4BYTE_KIND:
13926
3.41k
    {
13927
3.41k
        _PyUnicode_CONVERT_BYTES(
13928
3.41k
            Py_UCS1, Py_UCS4,
13929
3.41k
            ascii, ascii + len,
13930
3.41k
            (Py_UCS4 *)writer->data + writer->pos);
13931
3.41k
        break;
13932
0
    }
13933
0
    default:
13934
0
        Py_UNREACHABLE();
13935
52.3M
    }
13936
13937
52.3M
    writer->pos += len;
13938
52.3M
    return 0;
13939
52.3M
}
13940
13941
13942
int
13943
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
13944
                           const char *str,
13945
                           Py_ssize_t size)
13946
440k
{
13947
440k
    assert(writer != NULL);
13948
440k
    _Py_AssertHoldsTstate();
13949
13950
440k
    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
13951
440k
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
13952
440k
}
13953
13954
13955
int
13956
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
13957
                          const char *str,
13958
                          Py_ssize_t size)
13959
0
{
13960
0
    if (size < 0) {
13961
0
        size = strlen(str);
13962
0
    }
13963
13964
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13965
0
    Py_ssize_t old_pos = _writer->pos;
13966
0
    int res = unicode_decode_utf8_writer(_writer, str, size,
13967
0
                                         _Py_ERROR_STRICT, NULL, NULL);
13968
0
    if (res < 0) {
13969
0
        _writer->pos = old_pos;
13970
0
    }
13971
0
    return res;
13972
0
}
13973
13974
13975
int
13976
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
13977
                                   const char *string,
13978
                                   Py_ssize_t length,
13979
                                   const char *errors,
13980
                                   Py_ssize_t *consumed)
13981
0
{
13982
0
    if (length < 0) {
13983
0
        length = strlen(string);
13984
0
    }
13985
13986
0
    _PyUnicodeWriter *_writer = (_PyUnicodeWriter*)writer;
13987
0
    Py_ssize_t old_pos = _writer->pos;
13988
0
    int res = unicode_decode_utf8_writer(_writer, string, length,
13989
0
                                         _Py_ERROR_UNKNOWN, errors, consumed);
13990
0
    if (res < 0) {
13991
0
        _writer->pos = old_pos;
13992
0
        if (consumed) {
13993
0
            *consumed = 0;
13994
0
        }
13995
0
    }
13996
0
    return res;
13997
0
}
13998
13999
14000
int
14001
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14002
                                   const char *str, Py_ssize_t len)
14003
0
{
14004
0
    Py_UCS4 maxchar;
14005
14006
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
14007
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14008
0
        return -1;
14009
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
14010
0
    writer->pos += len;
14011
0
    return 0;
14012
0
}
14013
14014
PyObject *
14015
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
14016
47.6M
{
14017
47.6M
    PyObject *str;
14018
14019
47.6M
    if (writer->pos == 0) {
14020
868
        Py_CLEAR(writer->buffer);
14021
868
        _Py_RETURN_UNICODE_EMPTY();
14022
868
    }
14023
14024
47.6M
    str = writer->buffer;
14025
47.6M
    writer->buffer = NULL;
14026
14027
47.6M
    if (writer->readonly) {
14028
18.3k
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14029
18.3k
        return str;
14030
18.3k
    }
14031
14032
47.6M
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14033
46.5M
        PyObject *str2;
14034
46.5M
        str2 = resize_compact(str, writer->pos);
14035
46.5M
        if (str2 == NULL) {
14036
0
            Py_DECREF(str);
14037
0
            return NULL;
14038
0
        }
14039
46.5M
        str = str2;
14040
46.5M
    }
14041
14042
47.6M
    assert(_PyUnicode_CheckConsistency(str, 1));
14043
47.6M
    return unicode_result(str);
14044
47.6M
}
14045
14046
14047
PyObject*
14048
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
14049
4.72M
{
14050
4.72M
    PyObject *str = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);
14051
4.72M
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
14052
4.72M
    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
14053
4.72M
    return str;
14054
4.72M
}
14055
14056
14057
void
14058
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
14059
4.20M
{
14060
4.20M
    Py_CLEAR(writer->buffer);
14061
4.20M
}
14062
14063
#include "stringlib/unicode_format.h"
14064
14065
PyDoc_STRVAR(format__doc__,
14066
             "format($self, /, *args, **kwargs)\n\
14067
--\n\
14068
\n\
14069
Return a formatted version of the string, using substitutions from args and kwargs.\n\
14070
The substitutions are identified by braces ('{' and '}').");
14071
14072
PyDoc_STRVAR(format_map__doc__,
14073
             "format_map($self, mapping, /)\n\
14074
--\n\
14075
\n\
14076
Return a formatted version of the string, using substitutions from mapping.\n\
14077
The substitutions are identified by braces ('{' and '}').");
14078
14079
/*[clinic input]
14080
str.__format__ as unicode___format__
14081
14082
    format_spec: unicode
14083
    /
14084
14085
Return a formatted version of the string as described by format_spec.
14086
[clinic start generated code]*/
14087
14088
static PyObject *
14089
unicode___format___impl(PyObject *self, PyObject *format_spec)
14090
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
14091
0
{
14092
0
    _PyUnicodeWriter writer;
14093
0
    int ret;
14094
14095
0
    _PyUnicodeWriter_Init(&writer);
14096
0
    ret = _PyUnicode_FormatAdvancedWriter(&writer,
14097
0
                                          self, format_spec, 0,
14098
0
                                          PyUnicode_GET_LENGTH(format_spec));
14099
0
    if (ret == -1) {
14100
0
        _PyUnicodeWriter_Dealloc(&writer);
14101
0
        return NULL;
14102
0
    }
14103
0
    return _PyUnicodeWriter_Finish(&writer);
14104
0
}
14105
14106
/*[clinic input]
14107
str.__sizeof__ as unicode_sizeof
14108
14109
Return the size of the string in memory, in bytes.
14110
[clinic start generated code]*/
14111
14112
static PyObject *
14113
unicode_sizeof_impl(PyObject *self)
14114
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
14115
0
{
14116
0
    Py_ssize_t size;
14117
14118
    /* If it's a compact object, account for base structure +
14119
       character data. */
14120
0
    if (PyUnicode_IS_COMPACT_ASCII(self)) {
14121
0
        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14122
0
    }
14123
0
    else if (PyUnicode_IS_COMPACT(self)) {
14124
0
        size = sizeof(PyCompactUnicodeObject) +
14125
0
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
14126
0
    }
14127
0
    else {
14128
        /* If it is a two-block object, account for base object, and
14129
           for character block if present. */
14130
0
        size = sizeof(PyUnicodeObject);
14131
0
        if (_PyUnicode_DATA_ANY(self))
14132
0
            size += (PyUnicode_GET_LENGTH(self) + 1) *
14133
0
                PyUnicode_KIND(self);
14134
0
    }
14135
0
    if (_PyUnicode_HAS_UTF8_MEMORY(self))
14136
0
        size += PyUnicode_UTF8_LENGTH(self) + 1;
14137
14138
0
    return PyLong_FromSsize_t(size);
14139
0
}
14140
14141
static PyObject *
14142
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
14143
0
{
14144
0
    PyObject *copy = _PyUnicode_Copy(v);
14145
0
    if (!copy)
14146
0
        return NULL;
14147
0
    return Py_BuildValue("(N)", copy);
14148
0
}
14149
14150
/*
14151
This function searches for the longest common leading whitespace
14152
of all lines in the range [src, end).
14153
It returns the length of the common leading whitespace and sets `output` to
14154
point to the beginning of the common leading whitespace if length > 0.
14155
*/
14156
static Py_ssize_t
14157
search_longest_common_leading_whitespace(
14158
    const char *const src,
14159
    const char *const end,
14160
    const char **output)
14161
0
{
14162
    // [_start, _start + _len)
14163
    // describes the current longest common leading whitespace
14164
0
    const char *_start = NULL;
14165
0
    Py_ssize_t _len = 0;
14166
14167
0
    for (const char *iter = src; iter < end; ++iter) {
14168
0
        const char *line_start = iter;
14169
0
        const char *leading_whitespace_end = NULL;
14170
14171
        // scan the whole line
14172
0
        while (iter < end && *iter != '\n') {
14173
0
            if (!leading_whitespace_end && *iter != ' ' && *iter != '\t') {
14174
                /* `iter` points to the first non-whitespace character
14175
                   in this line */
14176
0
                if (iter == line_start) {
14177
                    // some line has no indent, fast exit!
14178
0
                    return 0;
14179
0
                }
14180
0
                leading_whitespace_end = iter;
14181
0
            }
14182
0
            ++iter;
14183
0
        }
14184
14185
        // if this line is all whitespace, skip it
14186
0
        if (!leading_whitespace_end) {
14187
0
            continue;
14188
0
        }
14189
14190
0
        if (!_start) {
14191
            // update the first leading whitespace
14192
0
            _start = line_start;
14193
0
            _len = leading_whitespace_end - line_start;
14194
0
            assert(_len > 0);
14195
0
        }
14196
0
        else {
14197
            /* We then compare with the current longest leading whitespace.
14198
14199
               [line_start, leading_whitespace_end) is the leading
14200
               whitespace of this line,
14201
14202
               [_start, _start + _len) is the leading whitespace of the
14203
               current longest leading whitespace. */
14204
0
            Py_ssize_t new_len = 0;
14205
0
            const char *_iter = _start, *line_iter = line_start;
14206
14207
0
            while (_iter < _start + _len && line_iter < leading_whitespace_end
14208
0
                   && *_iter == *line_iter)
14209
0
            {
14210
0
                ++_iter;
14211
0
                ++line_iter;
14212
0
                ++new_len;
14213
0
            }
14214
14215
0
            _len = new_len;
14216
0
            if (_len == 0) {
14217
                // No common things now, fast exit!
14218
0
                return 0;
14219
0
            }
14220
0
        }
14221
0
    }
14222
14223
0
    assert(_len >= 0);
14224
0
    if (_len > 0) {
14225
0
        *output = _start;
14226
0
    }
14227
0
    return _len;
14228
0
}
14229
14230
/* Dedent a string.
14231
   Behaviour is expected to be an exact match of `textwrap.dedent`.
14232
   Return a new reference on success, NULL with exception set on error.
14233
   */
14234
PyObject *
14235
_PyUnicode_Dedent(PyObject *unicode)
14236
0
{
14237
0
    Py_ssize_t src_len = 0;
14238
0
    const char *src = PyUnicode_AsUTF8AndSize(unicode, &src_len);
14239
0
    if (!src) {
14240
0
        return NULL;
14241
0
    }
14242
0
    assert(src_len >= 0);
14243
0
    if (src_len == 0) {
14244
0
        return Py_NewRef(unicode);
14245
0
    }
14246
14247
0
    const char *const end = src + src_len;
14248
14249
    // [whitespace_start, whitespace_start + whitespace_len)
14250
    // describes the current longest common leading whitespace
14251
0
    const char *whitespace_start = NULL;
14252
0
    Py_ssize_t whitespace_len = search_longest_common_leading_whitespace(
14253
0
        src, end, &whitespace_start);
14254
14255
0
    if (whitespace_len == 0) {
14256
0
        return Py_NewRef(unicode);
14257
0
    }
14258
14259
    // now we should trigger a dedent
14260
0
    char *dest = PyMem_Malloc(src_len);
14261
0
    if (!dest) {
14262
0
        PyErr_NoMemory();
14263
0
        return NULL;
14264
0
    }
14265
0
    char *dest_iter = dest;
14266
14267
0
    for (const char *iter = src; iter < end; ++iter) {
14268
0
        const char *line_start = iter;
14269
0
        bool in_leading_space = true;
14270
14271
        // iterate over a line to find the end of a line
14272
0
        while (iter < end && *iter != '\n') {
14273
0
            if (in_leading_space && *iter != ' ' && *iter != '\t') {
14274
0
                in_leading_space = false;
14275
0
            }
14276
0
            ++iter;
14277
0
        }
14278
14279
        // invariant: *iter == '\n' or iter == end
14280
0
        bool append_newline = iter < end;
14281
14282
        // if this line is all whitespace and ends with '\n', write '\n' and continue; an unterminated all-whitespace last line falls through
14283
0
        if (in_leading_space && append_newline) {
14284
0
            *dest_iter++ = '\n';
14285
0
            continue;
14286
0
        }
14287
14288
        /* copy [line_start + whitespace_len, iter) to buffer, then
14289
            conditionally append '\n' */
14290
14291
0
        Py_ssize_t new_line_len = iter - line_start - whitespace_len;
14292
0
        assert(new_line_len >= 0);
14293
0
        memcpy(dest_iter, line_start + whitespace_len, new_line_len);
14294
14295
0
        dest_iter += new_line_len;
14296
14297
0
        if (append_newline) {
14298
0
            *dest_iter++ = '\n';
14299
0
        }
14300
0
    }
14301
14302
0
    PyObject *res = PyUnicode_FromStringAndSize(dest, dest_iter - dest);
14303
0
    PyMem_Free(dest);
14304
0
    return res;
14305
0
}
14306
14307
static PyMethodDef unicode_methods[] = {
14308
    UNICODE_ENCODE_METHODDEF
14309
    UNICODE_REPLACE_METHODDEF
14310
    UNICODE_SPLIT_METHODDEF
14311
    UNICODE_RSPLIT_METHODDEF
14312
    UNICODE_JOIN_METHODDEF
14313
    UNICODE_CAPITALIZE_METHODDEF
14314
    UNICODE_CASEFOLD_METHODDEF
14315
    UNICODE_TITLE_METHODDEF
14316
    UNICODE_CENTER_METHODDEF
14317
    UNICODE_COUNT_METHODDEF
14318
    UNICODE_EXPANDTABS_METHODDEF
14319
    UNICODE_FIND_METHODDEF
14320
    UNICODE_PARTITION_METHODDEF
14321
    UNICODE_INDEX_METHODDEF
14322
    UNICODE_LJUST_METHODDEF
14323
    UNICODE_LOWER_METHODDEF
14324
    UNICODE_LSTRIP_METHODDEF
14325
    UNICODE_RFIND_METHODDEF
14326
    UNICODE_RINDEX_METHODDEF
14327
    UNICODE_RJUST_METHODDEF
14328
    UNICODE_RSTRIP_METHODDEF
14329
    UNICODE_RPARTITION_METHODDEF
14330
    UNICODE_SPLITLINES_METHODDEF
14331
    UNICODE_STRIP_METHODDEF
14332
    UNICODE_SWAPCASE_METHODDEF
14333
    UNICODE_TRANSLATE_METHODDEF
14334
    UNICODE_UPPER_METHODDEF
14335
    UNICODE_STARTSWITH_METHODDEF
14336
    UNICODE_ENDSWITH_METHODDEF
14337
    UNICODE_REMOVEPREFIX_METHODDEF
14338
    UNICODE_REMOVESUFFIX_METHODDEF
14339
    UNICODE_ISASCII_METHODDEF
14340
    UNICODE_ISLOWER_METHODDEF
14341
    UNICODE_ISUPPER_METHODDEF
14342
    UNICODE_ISTITLE_METHODDEF
14343
    UNICODE_ISSPACE_METHODDEF
14344
    UNICODE_ISDECIMAL_METHODDEF
14345
    UNICODE_ISDIGIT_METHODDEF
14346
    UNICODE_ISNUMERIC_METHODDEF
14347
    UNICODE_ISALPHA_METHODDEF
14348
    UNICODE_ISALNUM_METHODDEF
14349
    UNICODE_ISIDENTIFIER_METHODDEF
14350
    UNICODE_ISPRINTABLE_METHODDEF
14351
    UNICODE_ZFILL_METHODDEF
14352
    {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
14353
    {"format_map", do_string_format_map, METH_O, format_map__doc__},
14354
    UNICODE___FORMAT___METHODDEF
14355
    UNICODE_MAKETRANS_METHODDEF
14356
    UNICODE_SIZEOF_METHODDEF
14357
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
14358
    {NULL, NULL}
14359
};
14360
14361
static PyObject *
14362
unicode_mod(PyObject *v, PyObject *w)
14363
23.0M
{
14364
23.0M
    if (!PyUnicode_Check(v))
14365
0
        Py_RETURN_NOTIMPLEMENTED;
14366
23.0M
    return PyUnicode_Format(v, w);
14367
23.0M
}
14368
14369
static PyNumberMethods unicode_as_number = {
14370
    0,              /*nb_add*/
14371
    0,              /*nb_subtract*/
14372
    0,              /*nb_multiply*/
14373
    unicode_mod,            /*nb_remainder*/
14374
};
14375
14376
static PySequenceMethods unicode_as_sequence = {
14377
    unicode_length,     /* sq_length */
14378
    PyUnicode_Concat,   /* sq_concat */
14379
    unicode_repeat,     /* sq_repeat */
14380
    unicode_getitem,    /* sq_item */
14381
    0,                  /* sq_slice */
14382
    0,                  /* sq_ass_item */
14383
    0,                  /* sq_ass_slice */
14384
    PyUnicode_Contains, /* sq_contains */
14385
};
14386
14387
static PyObject*
14388
unicode_subscript(PyObject* self, PyObject* item)
14389
141M
{
14390
141M
    if (_PyIndex_Check(item)) {
14391
53.9M
        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
14392
53.9M
        if (i == -1 && PyErr_Occurred())
14393
0
            return NULL;
14394
53.9M
        if (i < 0)
14395
61.8k
            i += PyUnicode_GET_LENGTH(self);
14396
53.9M
        return unicode_getitem(self, i);
14397
87.0M
    } else if (PySlice_Check(item)) {
14398
87.0M
        Py_ssize_t start, stop, step, slicelength, i;
14399
87.0M
        size_t cur;
14400
87.0M
        PyObject *result;
14401
87.0M
        const void *src_data;
14402
87.0M
        void *dest_data;
14403
87.0M
        int src_kind, dest_kind;
14404
87.0M
        Py_UCS4 ch, max_char, kind_limit;
14405
14406
87.0M
        if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
14407
0
            return NULL;
14408
0
        }
14409
87.0M
        slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14410
87.0M
                                            &start, &stop, step);
14411
14412
87.0M
        if (slicelength <= 0) {
14413
14.8M
            _Py_RETURN_UNICODE_EMPTY();
14414
72.2M
        } else if (start == 0 && step == 1 &&
14415
30.6M
                   slicelength == PyUnicode_GET_LENGTH(self)) {
14416
6.57M
            return unicode_result_unchanged(self);
14417
65.6M
        } else if (step == 1) {
14418
65.6M
            return PyUnicode_Substring(self,
14419
65.6M
                                       start, start + slicelength);
14420
65.6M
        }
14421
        /* General case */
14422
0
        src_kind = PyUnicode_KIND(self);
14423
0
        src_data = PyUnicode_DATA(self);
14424
0
        if (!PyUnicode_IS_ASCII(self)) {
14425
0
            kind_limit = kind_maxchar_limit(src_kind);
14426
0
            max_char = 0;
14427
0
            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14428
0
                ch = PyUnicode_READ(src_kind, src_data, cur);
14429
0
                if (ch > max_char) {
14430
0
                    max_char = ch;
14431
0
                    if (max_char >= kind_limit)
14432
0
                        break;
14433
0
                }
14434
0
            }
14435
0
        }
14436
0
        else
14437
0
            max_char = 127;
14438
0
        result = PyUnicode_New(slicelength, max_char);
14439
0
        if (result == NULL)
14440
0
            return NULL;
14441
0
        dest_kind = PyUnicode_KIND(result);
14442
0
        dest_data = PyUnicode_DATA(result);
14443
14444
0
        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14445
0
            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14446
0
            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
14447
0
        }
14448
0
        assert(_PyUnicode_CheckConsistency(result, 1));
14449
0
        return result;
14450
0
    } else {
14451
0
        PyErr_Format(PyExc_TypeError, "string indices must be integers, not '%.200s'",
14452
0
                     Py_TYPE(item)->tp_name);
14453
0
        return NULL;
14454
0
    }
14455
141M
}
14456
14457
static PyMappingMethods unicode_as_mapping = {
14458
    unicode_length,     /* mp_length */
14459
    unicode_subscript,  /* mp_subscript */
14460
    0,                  /* mp_ass_subscript */
14461
};
14462
14463
14464
/* Helpers for PyUnicode_Format() */
14465
14466
struct unicode_formatter_t {
14467
    PyObject *args;
14468
    int args_owned;
14469
    Py_ssize_t arglen, argidx;
14470
    PyObject *dict;
14471
14472
    int fmtkind;
14473
    Py_ssize_t fmtcnt, fmtpos;
14474
    const void *fmtdata;
14475
    PyObject *fmtstr;
14476
14477
    _PyUnicodeWriter writer;
14478
};
14479
14480
struct unicode_format_arg_t {
14481
    Py_UCS4 ch;
14482
    int flags;
14483
    Py_ssize_t width;
14484
    int prec;
14485
    int sign;
14486
};
14487
14488
static PyObject *
14489
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14490
45.0M
{
14491
45.0M
    Py_ssize_t argidx = ctx->argidx;
14492
14493
45.0M
    if (argidx < ctx->arglen) {
14494
45.0M
        ctx->argidx++;
14495
45.0M
        if (ctx->arglen < 0)
14496
17.5M
            return ctx->args;
14497
27.4M
        else
14498
27.4M
            return PyTuple_GetItem(ctx->args, argidx);
14499
45.0M
    }
14500
0
    PyErr_SetString(PyExc_TypeError,
14501
0
                    "not enough arguments for format string");
14502
0
    return NULL;
14503
45.0M
}
14504
14505
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14506
14507
/* Format a float into the writer if the writer is not NULL, or into *p_output
14508
   otherwise.
14509
14510
   Return 0 on success, raise an exception and return -1 on error. */
14511
static int
14512
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14513
            PyObject **p_output,
14514
            _PyUnicodeWriter *writer)
14515
0
{
14516
0
    char *p;
14517
0
    double x;
14518
0
    Py_ssize_t len;
14519
0
    int prec;
14520
0
    int dtoa_flags = 0;
14521
14522
0
    x = PyFloat_AsDouble(v);
14523
0
    if (x == -1.0 && PyErr_Occurred())
14524
0
        return -1;
14525
14526
0
    prec = arg->prec;
14527
0
    if (prec < 0)
14528
0
        prec = 6;
14529
14530
0
    if (arg->flags & F_ALT)
14531
0
        dtoa_flags |= Py_DTSF_ALT;
14532
0
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14533
0
    if (p == NULL)
14534
0
        return -1;
14535
0
    len = strlen(p);
14536
0
    if (writer) {
14537
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14538
0
            PyMem_Free(p);
14539
0
            return -1;
14540
0
        }
14541
0
    }
14542
0
    else
14543
0
        *p_output = _PyUnicode_FromASCII(p, len);
14544
0
    PyMem_Free(p);
14545
0
    return 0;
14546
0
}
14547
14548
/* formatlong() emulates the format codes d, u, o, x and X, and
14549
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14550
 * Python's regular ints.
14551
 * Return value:  a new PyUnicodeObject*, or NULL if error.
14552
 *     The output string is of the form
14553
 *         "-"? ("0x" | "0X")? digit+
14554
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14555
 *         set in flags.  The case of hex digits will be correct,
14556
 *     There will be at least prec digits, zero-filled on the left if
14557
 *         necessary to get that many.
14558
 * val          object to be converted
14559
 * flags        bitmask of format flags; only F_ALT is looked at
14560
 * prec         minimum number of digits; 0-fill on left if needed
14561
 * type         a character in [duoxX]; u acts the same as d
14562
 *
14563
 * CAUTION:  o, x and X conversions on regular ints can never
14564
 * produce a '-' sign, but can for Python's unbounded ints.
14565
 */
14566
PyObject *
14567
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14568
1.53k
{
14569
1.53k
    PyObject *result = NULL;
14570
1.53k
    char *buf;
14571
1.53k
    Py_ssize_t i;
14572
1.53k
    int sign;           /* 1 if '-', else 0 */
14573
1.53k
    int len;            /* number of characters */
14574
1.53k
    Py_ssize_t llen;
14575
1.53k
    int numdigits;      /* len == numnondigits + numdigits */
14576
1.53k
    int numnondigits = 0;
14577
14578
    /* Avoid exceeding SSIZE_T_MAX */
14579
1.53k
    if (prec > INT_MAX-3) {
14580
0
        PyErr_SetString(PyExc_OverflowError,
14581
0
                        "precision too large");
14582
0
        return NULL;
14583
0
    }
14584
14585
1.53k
    assert(PyLong_Check(val));
14586
14587
1.53k
    switch (type) {
14588
0
    default:
14589
0
        Py_UNREACHABLE();
14590
0
    case 'd':
14591
0
    case 'i':
14592
0
    case 'u':
14593
        /* int and int subclasses should print numerically when a numeric */
14594
        /* format code is used (see issue18780) */
14595
0
        result = PyNumber_ToBase(val, 10);
14596
0
        break;
14597
0
    case 'o':
14598
0
        numnondigits = 2;
14599
0
        result = PyNumber_ToBase(val, 8);
14600
0
        break;
14601
0
    case 'x':
14602
1.53k
    case 'X':
14603
1.53k
        numnondigits = 2;
14604
1.53k
        result = PyNumber_ToBase(val, 16);
14605
1.53k
        break;
14606
1.53k
    }
14607
1.53k
    if (!result)
14608
0
        return NULL;
14609
14610
1.53k
    assert(unicode_modifiable(result));
14611
1.53k
    assert(PyUnicode_IS_ASCII(result));
14612
14613
    /* To modify the string in-place, there can only be one reference. */
14614
1.53k
    if (!_PyObject_IsUniquelyReferenced(result)) {
14615
0
        Py_DECREF(result);
14616
0
        PyErr_BadInternalCall();
14617
0
        return NULL;
14618
0
    }
14619
1.53k
    buf = PyUnicode_DATA(result);
14620
1.53k
    llen = PyUnicode_GET_LENGTH(result);
14621
1.53k
    if (llen > INT_MAX) {
14622
0
        Py_DECREF(result);
14623
0
        PyErr_SetString(PyExc_ValueError,
14624
0
                        "string too large in _PyUnicode_FormatLong");
14625
0
        return NULL;
14626
0
    }
14627
1.53k
    len = (int)llen;
14628
1.53k
    sign = buf[0] == '-';
14629
1.53k
    numnondigits += sign;
14630
1.53k
    numdigits = len - numnondigits;
14631
1.53k
    assert(numdigits > 0);
14632
14633
    /* Get rid of base marker unless F_ALT */
14634
1.53k
    if (((alt) == 0 &&
14635
1.53k
        (type == 'o' || type == 'x' || type == 'X'))) {
14636
1.53k
        assert(buf[sign] == '0');
14637
1.53k
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14638
1.53k
               buf[sign+1] == 'o');
14639
1.53k
        numnondigits -= 2;
14640
1.53k
        buf += 2;
14641
1.53k
        len -= 2;
14642
1.53k
        if (sign)
14643
0
            buf[0] = '-';
14644
1.53k
        assert(len == numnondigits + numdigits);
14645
1.53k
        assert(numdigits > 0);
14646
1.53k
    }
14647
14648
    /* Fill with leading zeroes to meet minimum width. */
14649
1.53k
    if (prec > numdigits) {
14650
0
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14651
0
                                numnondigits + prec);
14652
0
        char *b1;
14653
0
        if (!r1) {
14654
0
            Py_DECREF(result);
14655
0
            return NULL;
14656
0
        }
14657
0
        b1 = PyBytes_AS_STRING(r1);
14658
0
        for (i = 0; i < numnondigits; ++i)
14659
0
            *b1++ = *buf++;
14660
0
        for (i = 0; i < prec - numdigits; i++)
14661
0
            *b1++ = '0';
14662
0
        for (i = 0; i < numdigits; i++)
14663
0
            *b1++ = *buf++;
14664
0
        *b1 = '\0';
14665
0
        Py_SETREF(result, r1);
14666
0
        buf = PyBytes_AS_STRING(result);
14667
0
        len = numnondigits + prec;
14668
0
    }
14669
14670
    /* Fix up case for hex conversions. */
14671
1.53k
    if (type == 'X') {
14672
        /* Need to convert all lower case letters to upper case.
14673
           and need to convert 0x to 0X (and -0x to -0X). */
14674
4.51k
        for (i = 0; i < len; i++)
14675
2.97k
            if (buf[i] >= 'a' && buf[i] <= 'x')
14676
1.15k
                buf[i] -= 'a'-'A';
14677
1.53k
    }
14678
1.53k
    if (!PyUnicode_Check(result)
14679
1.53k
        || buf != PyUnicode_DATA(result)) {
14680
1.53k
        PyObject *unicode;
14681
1.53k
        unicode = _PyUnicode_FromASCII(buf, len);
14682
1.53k
        Py_SETREF(result, unicode);
14683
1.53k
    }
14684
0
    else if (len != PyUnicode_GET_LENGTH(result)) {
14685
0
        if (PyUnicode_Resize(&result, len) < 0)
14686
0
            Py_CLEAR(result);
14687
0
    }
14688
1.53k
    return result;
14689
1.53k
}
14690
14691
/* Format an integer or a float as an integer.
14692
 * Return 1 if the number has been formatted into the writer,
14693
 *        0 if the number has been formatted into *p_output
14694
 *       -1 and raise an exception on error */
14695
static int
14696
mainformatlong(PyObject *v,
14697
               struct unicode_format_arg_t *arg,
14698
               PyObject **p_output,
14699
               _PyUnicodeWriter *writer)
14700
11.0M
{
14701
11.0M
    PyObject *iobj, *res;
14702
11.0M
    char type = (char)arg->ch;
14703
14704
11.0M
    if (!PyNumber_Check(v))
14705
4.15M
        goto wrongtype;
14706
14707
    /* make sure number is a type of integer for o, x, and X */
14708
6.87M
    if (!PyLong_Check(v)) {
14709
0
        if (type == 'o' || type == 'x' || type == 'X') {
14710
0
            iobj = _PyNumber_Index(v);
14711
0
        }
14712
0
        else {
14713
0
            iobj = PyNumber_Long(v);
14714
0
        }
14715
0
        if (iobj == NULL ) {
14716
0
            if (PyErr_ExceptionMatches(PyExc_TypeError))
14717
0
                goto wrongtype;
14718
0
            return -1;
14719
0
        }
14720
0
        assert(PyLong_Check(iobj));
14721
0
    }
14722
6.87M
    else {
14723
6.87M
        iobj = Py_NewRef(v);
14724
6.87M
    }
14725
14726
6.87M
    if (PyLong_CheckExact(v)
14727
6.87M
        && arg->width == -1 && arg->prec == -1
14728
6.87M
        && !(arg->flags & (F_SIGN | F_BLANK))
14729
6.87M
        && type != 'X')
14730
6.87M
    {
14731
        /* Fast path */
14732
6.87M
        int alternate = arg->flags & F_ALT;
14733
6.87M
        int base;
14734
14735
6.87M
        switch(type)
14736
6.87M
        {
14737
0
            default:
14738
0
                Py_UNREACHABLE();
14739
6.87M
            case 'd':
14740
6.87M
            case 'i':
14741
6.87M
            case 'u':
14742
6.87M
                base = 10;
14743
6.87M
                break;
14744
0
            case 'o':
14745
0
                base = 8;
14746
0
                break;
14747
0
            case 'x':
14748
0
            case 'X':
14749
0
                base = 16;
14750
0
                break;
14751
6.87M
        }
14752
14753
6.87M
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14754
0
            Py_DECREF(iobj);
14755
0
            return -1;
14756
0
        }
14757
6.87M
        Py_DECREF(iobj);
14758
6.87M
        return 1;
14759
6.87M
    }
14760
14761
1.53k
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14762
1.53k
    Py_DECREF(iobj);
14763
1.53k
    if (res == NULL)
14764
0
        return -1;
14765
1.53k
    *p_output = res;
14766
1.53k
    return 0;
14767
14768
4.15M
wrongtype:
14769
4.15M
    switch(type)
14770
4.15M
    {
14771
0
        case 'o':
14772
0
        case 'x':
14773
0
        case 'X':
14774
0
            PyErr_Format(PyExc_TypeError,
14775
0
                    "%%%c format: an integer is required, "
14776
0
                    "not %.200s",
14777
0
                    type, Py_TYPE(v)->tp_name);
14778
0
            break;
14779
4.15M
        default:
14780
4.15M
            PyErr_Format(PyExc_TypeError,
14781
4.15M
                    "%%%c format: a real number is required, "
14782
4.15M
                    "not %.200s",
14783
4.15M
                    type, Py_TYPE(v)->tp_name);
14784
4.15M
            break;
14785
4.15M
    }
14786
4.15M
    return -1;
14787
4.15M
}
14788
14789
static Py_UCS4
14790
formatchar(PyObject *v)
14791
0
{
14792
    /* presume that the buffer is at least 3 characters long */
14793
0
    if (PyUnicode_Check(v)) {
14794
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
14795
0
            return PyUnicode_READ_CHAR(v, 0);
14796
0
        }
14797
0
        PyErr_Format(PyExc_TypeError,
14798
0
                     "%%c requires an int or a unicode character, "
14799
0
                     "not a string of length %zd",
14800
0
                     PyUnicode_GET_LENGTH(v));
14801
0
        return (Py_UCS4) -1;
14802
0
    }
14803
0
    else {
14804
0
        int overflow;
14805
0
        long x = PyLong_AsLongAndOverflow(v, &overflow);
14806
0
        if (x == -1 && PyErr_Occurred()) {
14807
0
            if (PyErr_ExceptionMatches(PyExc_TypeError)) {
14808
0
                PyErr_Format(PyExc_TypeError,
14809
0
                             "%%c requires an int or a unicode character, not %T",
14810
0
                             v);
14811
0
                return (Py_UCS4) -1;
14812
0
            }
14813
0
            return (Py_UCS4) -1;
14814
0
        }
14815
14816
0
        if (x < 0 || x > MAX_UNICODE) {
14817
            /* this includes an overflow in converting to C long */
14818
0
            PyErr_SetString(PyExc_OverflowError,
14819
0
                            "%c arg not in range(0x110000)");
14820
0
            return (Py_UCS4) -1;
14821
0
        }
14822
14823
0
        return (Py_UCS4) x;
14824
0
    }
14825
0
}
14826
14827
/* Parse options of an argument: flags, width, precision.
14828
   Handle also "%(name)" syntax.
14829
14830
   Return 0 if the argument has been formatted into arg->str.
14831
   Return 1 if the argument has been written into ctx->writer,
14832
   Raise an exception and return -1 on error. */
14833
static int
14834
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14835
                         struct unicode_format_arg_t *arg)
14836
45.0M
{
14837
45.0M
#define FORMAT_READ(ctx) \
14838
45.3M
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14839
14840
45.0M
    PyObject *v;
14841
14842
45.0M
    if (arg->ch == '(') {
14843
        /* Get argument value from a dictionary. Example: "%(name)s". */
14844
38.3k
        Py_ssize_t keystart;
14845
38.3k
        Py_ssize_t keylen;
14846
38.3k
        PyObject *key;
14847
38.3k
        int pcount = 1;
14848
14849
38.3k
        if (ctx->dict == NULL) {
14850
0
            PyErr_SetString(PyExc_TypeError,
14851
0
                            "format requires a mapping");
14852
0
            return -1;
14853
0
        }
14854
38.3k
        ++ctx->fmtpos;
14855
38.3k
        --ctx->fmtcnt;
14856
38.3k
        keystart = ctx->fmtpos;
14857
        /* Skip over balanced parentheses */
14858
345k
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14859
306k
            arg->ch = FORMAT_READ(ctx);
14860
306k
            if (arg->ch == ')')
14861
38.3k
                --pcount;
14862
268k
            else if (arg->ch == '(')
14863
0
                ++pcount;
14864
306k
            ctx->fmtpos++;
14865
306k
        }
14866
38.3k
        keylen = ctx->fmtpos - keystart - 1;
14867
38.3k
        if (ctx->fmtcnt < 0 || pcount > 0) {
14868
0
            PyErr_SetString(PyExc_ValueError,
14869
0
                            "incomplete format key");
14870
0
            return -1;
14871
0
        }
14872
38.3k
        key = PyUnicode_Substring(ctx->fmtstr,
14873
38.3k
                                  keystart, keystart + keylen);
14874
38.3k
        if (key == NULL)
14875
0
            return -1;
14876
38.3k
        if (ctx->args_owned) {
14877
27.3k
            ctx->args_owned = 0;
14878
27.3k
            Py_DECREF(ctx->args);
14879
27.3k
        }
14880
38.3k
        ctx->args = PyObject_GetItem(ctx->dict, key);
14881
38.3k
        Py_DECREF(key);
14882
38.3k
        if (ctx->args == NULL)
14883
0
            return -1;
14884
38.3k
        ctx->args_owned = 1;
14885
38.3k
        ctx->arglen = -1;
14886
38.3k
        ctx->argidx = -2;
14887
38.3k
    }
14888
14889
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14890
45.0M
    while (--ctx->fmtcnt >= 0) {
14891
45.0M
        arg->ch = FORMAT_READ(ctx);
14892
45.0M
        ctx->fmtpos++;
14893
45.0M
        switch (arg->ch) {
14894
0
        case '-': arg->flags |= F_LJUST; continue;
14895
0
        case '+': arg->flags |= F_SIGN; continue;
14896
0
        case ' ': arg->flags |= F_BLANK; continue;
14897
0
        case '#': arg->flags |= F_ALT; continue;
14898
1.53k
        case '0': arg->flags |= F_ZERO; continue;
14899
45.0M
        }
14900
45.0M
        break;
14901
45.0M
    }
14902
14903
    /* Parse width. Example: "%10s" => width=10 */
14904
45.0M
    if (arg->ch == '*') {
14905
0
        v = unicode_format_getnextarg(ctx);
14906
0
        if (v == NULL)
14907
0
            return -1;
14908
0
        if (!PyLong_Check(v)) {
14909
0
            PyErr_SetString(PyExc_TypeError,
14910
0
                            "* wants int");
14911
0
            return -1;
14912
0
        }
14913
0
        arg->width = PyLong_AsSsize_t(v);
14914
0
        if (arg->width == -1 && PyErr_Occurred())
14915
0
            return -1;
14916
0
        if (arg->width < 0) {
14917
0
            arg->flags |= F_LJUST;
14918
0
            arg->width = -arg->width;
14919
0
        }
14920
0
        if (--ctx->fmtcnt >= 0) {
14921
0
            arg->ch = FORMAT_READ(ctx);
14922
0
            ctx->fmtpos++;
14923
0
        }
14924
0
    }
14925
45.0M
    else if (arg->ch >= '0' && arg->ch <= '9') {
14926
1.53k
        arg->width = arg->ch - '0';
14927
1.53k
        while (--ctx->fmtcnt >= 0) {
14928
1.53k
            arg->ch = FORMAT_READ(ctx);
14929
1.53k
            ctx->fmtpos++;
14930
1.53k
            if (arg->ch < '0' || arg->ch > '9')
14931
1.53k
                break;
14932
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14933
               mixing signed and unsigned comparison. Since arg->ch is between
14934
               '0' and '9', casting to int is safe. */
14935
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14936
0
                PyErr_SetString(PyExc_ValueError,
14937
0
                                "width too big");
14938
0
                return -1;
14939
0
            }
14940
0
            arg->width = arg->width*10 + (arg->ch - '0');
14941
0
        }
14942
1.53k
    }
14943
14944
    /* Parse precision. Example: "%.3f" => prec=3 */
14945
45.0M
    if (arg->ch == '.') {
14946
0
        arg->prec = 0;
14947
0
        if (--ctx->fmtcnt >= 0) {
14948
0
            arg->ch = FORMAT_READ(ctx);
14949
0
            ctx->fmtpos++;
14950
0
        }
14951
0
        if (arg->ch == '*') {
14952
0
            v = unicode_format_getnextarg(ctx);
14953
0
            if (v == NULL)
14954
0
                return -1;
14955
0
            if (!PyLong_Check(v)) {
14956
0
                PyErr_SetString(PyExc_TypeError,
14957
0
                                "* wants int");
14958
0
                return -1;
14959
0
            }
14960
0
            arg->prec = PyLong_AsInt(v);
14961
0
            if (arg->prec == -1 && PyErr_Occurred())
14962
0
                return -1;
14963
0
            if (arg->prec < 0)
14964
0
                arg->prec = 0;
14965
0
            if (--ctx->fmtcnt >= 0) {
14966
0
                arg->ch = FORMAT_READ(ctx);
14967
0
                ctx->fmtpos++;
14968
0
            }
14969
0
        }
14970
0
        else if (arg->ch >= '0' && arg->ch <= '9') {
14971
0
            arg->prec = arg->ch - '0';
14972
0
            while (--ctx->fmtcnt >= 0) {
14973
0
                arg->ch = FORMAT_READ(ctx);
14974
0
                ctx->fmtpos++;
14975
0
                if (arg->ch < '0' || arg->ch > '9')
14976
0
                    break;
14977
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14978
0
                    PyErr_SetString(PyExc_ValueError,
14979
0
                                    "precision too big");
14980
0
                    return -1;
14981
0
                }
14982
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
14983
0
            }
14984
0
        }
14985
0
    }
14986
14987
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14988
45.0M
    if (ctx->fmtcnt >= 0) {
14989
45.0M
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14990
0
            if (--ctx->fmtcnt >= 0) {
14991
0
                arg->ch = FORMAT_READ(ctx);
14992
0
                ctx->fmtpos++;
14993
0
            }
14994
0
        }
14995
45.0M
    }
14996
45.0M
    if (ctx->fmtcnt < 0) {
14997
0
        PyErr_SetString(PyExc_ValueError,
14998
0
                        "incomplete format");
14999
0
        return -1;
15000
0
    }
15001
45.0M
    return 0;
15002
15003
45.0M
#undef FORMAT_READ
15004
45.0M
}
15005
15006
/* Format one argument. Supported conversion specifiers:
15007
15008
   - "s", "r", "a": any type
15009
   - "i", "d", "u": int or float
15010
   - "o", "x", "X": int
15011
   - "e", "E", "f", "F", "g", "G": float
15012
   - "c": int or str (1 character)
15013
15014
   When possible, the output is written directly into the Unicode writer
15015
   (ctx->writer). A string is created when padding is required.
15016
15017
   Return 0 if the argument has been formatted into *p_str,
15018
          1 if the argument has been written into ctx->writer,
15019
         -1 on error. */
15020
static int
15021
unicode_format_arg_format(struct unicode_formatter_t *ctx,
15022
                          struct unicode_format_arg_t *arg,
15023
                          PyObject **p_str)
15024
45.0M
{
15025
45.0M
    PyObject *v;
15026
45.0M
    _PyUnicodeWriter *writer = &ctx->writer;
15027
15028
45.0M
    if (ctx->fmtcnt == 0)
15029
10.9M
        ctx->writer.overallocate = 0;
15030
15031
45.0M
    v = unicode_format_getnextarg(ctx);
15032
45.0M
    if (v == NULL)
15033
0
        return -1;
15034
15035
15036
45.0M
    switch (arg->ch) {
15037
33.9M
    case 's':
15038
33.9M
    case 'r':
15039
33.9M
    case 'a':
15040
33.9M
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15041
            /* Fast path */
15042
0
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15043
0
                return -1;
15044
0
            return 1;
15045
0
        }
15046
15047
33.9M
        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15048
33.9M
            *p_str = Py_NewRef(v);
15049
33.9M
        }
15050
0
        else {
15051
0
            if (arg->ch == 's')
15052
0
                *p_str = PyObject_Str(v);
15053
0
            else if (arg->ch == 'r')
15054
0
                *p_str = PyObject_Repr(v);
15055
0
            else
15056
0
                *p_str = PyObject_ASCII(v);
15057
0
        }
15058
33.9M
        break;
15059
15060
0
    case 'i':
15061
11.0M
    case 'd':
15062
11.0M
    case 'u':
15063
11.0M
    case 'o':
15064
11.0M
    case 'x':
15065
11.0M
    case 'X':
15066
11.0M
    {
15067
11.0M
        int ret = mainformatlong(v, arg, p_str, writer);
15068
11.0M
        if (ret != 0)
15069
11.0M
            return ret;
15070
1.53k
        arg->sign = 1;
15071
1.53k
        break;
15072
11.0M
    }
15073
15074
0
    case 'e':
15075
0
    case 'E':
15076
0
    case 'f':
15077
0
    case 'F':
15078
0
    case 'g':
15079
0
    case 'G':
15080
0
        if (arg->width == -1 && arg->prec == -1
15081
0
            && !(arg->flags & (F_SIGN | F_BLANK)))
15082
0
        {
15083
            /* Fast path */
15084
0
            if (formatfloat(v, arg, NULL, writer) == -1)
15085
0
                return -1;
15086
0
            return 1;
15087
0
        }
15088
15089
0
        arg->sign = 1;
15090
0
        if (formatfloat(v, arg, p_str, NULL) == -1)
15091
0
            return -1;
15092
0
        break;
15093
15094
0
    case 'c':
15095
0
    {
15096
0
        Py_UCS4 ch = formatchar(v);
15097
0
        if (ch == (Py_UCS4) -1)
15098
0
            return -1;
15099
0
        if (arg->width == -1 && arg->prec == -1) {
15100
            /* Fast path */
15101
0
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
15102
0
                return -1;
15103
0
            return 1;
15104
0
        }
15105
0
        *p_str = PyUnicode_FromOrdinal(ch);
15106
0
        break;
15107
0
    }
15108
15109
0
    default:
15110
0
        PyErr_Format(PyExc_ValueError,
15111
0
                     "unsupported format character '%c' (0x%x) "
15112
0
                     "at index %zd",
15113
0
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15114
0
                     (int)arg->ch,
15115
0
                     ctx->fmtpos - 1);
15116
0
        return -1;
15117
45.0M
    }
15118
33.9M
    if (*p_str == NULL)
15119
0
        return -1;
15120
33.9M
    assert (PyUnicode_Check(*p_str));
15121
33.9M
    return 0;
15122
33.9M
}
15123
15124
static int
15125
unicode_format_arg_output(struct unicode_formatter_t *ctx,
15126
                          struct unicode_format_arg_t *arg,
15127
                          PyObject *str)
15128
33.9M
{
15129
33.9M
    Py_ssize_t len;
15130
33.9M
    int kind;
15131
33.9M
    const void *pbuf;
15132
33.9M
    Py_ssize_t pindex;
15133
33.9M
    Py_UCS4 signchar;
15134
33.9M
    Py_ssize_t buflen;
15135
33.9M
    Py_UCS4 maxchar;
15136
33.9M
    Py_ssize_t sublen;
15137
33.9M
    _PyUnicodeWriter *writer = &ctx->writer;
15138
33.9M
    Py_UCS4 fill;
15139
15140
33.9M
    fill = ' ';
15141
33.9M
    if (arg->sign && arg->flags & F_ZERO)
15142
1.53k
        fill = '0';
15143
15144
33.9M
    len = PyUnicode_GET_LENGTH(str);
15145
33.9M
    if ((arg->width == -1 || arg->width <= len)
15146
33.9M
        && (arg->prec == -1 || arg->prec >= len)
15147
33.9M
        && !(arg->flags & (F_SIGN | F_BLANK)))
15148
33.9M
    {
15149
        /* Fast path */
15150
33.9M
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15151
0
            return -1;
15152
33.9M
        return 0;
15153
33.9M
    }
15154
15155
    /* Truncate the string for "s", "r" and "a" formats
15156
       if the precision is set */
15157
96
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15158
0
        if (arg->prec >= 0 && len > arg->prec)
15159
0
            len = arg->prec;
15160
0
    }
15161
15162
    /* Adjust sign and width */
15163
96
    kind = PyUnicode_KIND(str);
15164
96
    pbuf = PyUnicode_DATA(str);
15165
96
    pindex = 0;
15166
96
    signchar = '\0';
15167
96
    if (arg->sign) {
15168
96
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15169
96
        if (ch == '-' || ch == '+') {
15170
0
            signchar = ch;
15171
0
            len--;
15172
0
            pindex++;
15173
0
        }
15174
96
        else if (arg->flags & F_SIGN)
15175
0
            signchar = '+';
15176
96
        else if (arg->flags & F_BLANK)
15177
0
            signchar = ' ';
15178
96
        else
15179
96
            arg->sign = 0;
15180
96
    }
15181
96
    if (arg->width < len)
15182
0
        arg->width = len;
15183
15184
    /* Prepare the writer */
15185
96
    maxchar = writer->maxchar;
15186
96
    if (!(arg->flags & F_LJUST)) {
15187
96
        if (arg->sign) {
15188
0
            if ((arg->width-1) > len)
15189
0
                maxchar = Py_MAX(maxchar, fill);
15190
0
        }
15191
96
        else {
15192
96
            if (arg->width > len)
15193
96
                maxchar = Py_MAX(maxchar, fill);
15194
96
        }
15195
96
    }
15196
96
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15197
0
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
15198
0
        maxchar = Py_MAX(maxchar, strmaxchar);
15199
0
    }
15200
15201
96
    buflen = arg->width;
15202
96
    if (arg->sign && len == arg->width)
15203
0
        buflen++;
15204
96
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
15205
0
        return -1;
15206
15207
    /* Write the sign if needed */
15208
96
    if (arg->sign) {
15209
0
        if (fill != ' ') {
15210
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15211
0
            writer->pos += 1;
15212
0
        }
15213
0
        if (arg->width > len)
15214
0
            arg->width--;
15215
0
    }
15216
15217
    /* Write the numeric prefix for "x", "X" and "o" formats
15218
       if the alternate form is used.
15219
       For example, write "0x" for the "%#x" format. */
15220
96
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15221
0
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15222
0
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15223
0
        if (fill != ' ') {
15224
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15225
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15226
0
            writer->pos += 2;
15227
0
            pindex += 2;
15228
0
        }
15229
0
        arg->width -= 2;
15230
0
        if (arg->width < 0)
15231
0
            arg->width = 0;
15232
0
        len -= 2;
15233
0
    }
15234
15235
    /* Pad left with the fill character if needed */
15236
96
    if (arg->width > len && !(arg->flags & F_LJUST)) {
15237
96
        sublen = arg->width - len;
15238
96
        _PyUnicode_Fill(writer->kind, writer->data, fill, writer->pos, sublen);
15239
96
        writer->pos += sublen;
15240
96
        arg->width = len;
15241
96
    }
15242
15243
    /* If padding with spaces: write sign if needed and/or numeric prefix if
15244
       the alternate form is used */
15245
96
    if (fill == ' ') {
15246
0
        if (arg->sign) {
15247
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15248
0
            writer->pos += 1;
15249
0
        }
15250
0
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15251
0
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15252
0
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15253
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15254
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15255
0
            writer->pos += 2;
15256
0
            pindex += 2;
15257
0
        }
15258
0
    }
15259
15260
    /* Write characters */
15261
96
    if (len) {
15262
96
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15263
96
                                      str, pindex, len);
15264
96
        writer->pos += len;
15265
96
    }
15266
15267
    /* Pad right with the fill character if needed */
15268
96
    if (arg->width > len) {
15269
0
        sublen = arg->width - len;
15270
0
        _PyUnicode_Fill(writer->kind, writer->data, ' ', writer->pos, sublen);
15271
0
        writer->pos += sublen;
15272
0
    }
15273
96
    return 0;
15274
96
}
15275
15276
/* Helper of PyUnicode_Format(): format one arg.
15277
   Return 0 on success, raise an exception and return -1 on error. */
15278
static int
15279
unicode_format_arg(struct unicode_formatter_t *ctx)
15280
45.0M
{
15281
45.0M
    struct unicode_format_arg_t arg;
15282
45.0M
    PyObject *str;
15283
45.0M
    int ret;
15284
15285
45.0M
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
15286
45.0M
    if (arg.ch == '%') {
15287
0
        ctx->fmtpos++;
15288
0
        ctx->fmtcnt--;
15289
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15290
0
            return -1;
15291
0
        return 0;
15292
0
    }
15293
45.0M
    arg.flags = 0;
15294
45.0M
    arg.width = -1;
15295
45.0M
    arg.prec = -1;
15296
45.0M
    arg.sign = 0;
15297
45.0M
    str = NULL;
15298
15299
45.0M
    ret = unicode_format_arg_parse(ctx, &arg);
15300
45.0M
    if (ret == -1)
15301
0
        return -1;
15302
15303
45.0M
    ret = unicode_format_arg_format(ctx, &arg, &str);
15304
45.0M
    if (ret == -1)
15305
4.15M
        return -1;
15306
15307
40.8M
    if (ret != 1) {
15308
33.9M
        ret = unicode_format_arg_output(ctx, &arg, str);
15309
33.9M
        Py_DECREF(str);
15310
33.9M
        if (ret == -1)
15311
0
            return -1;
15312
33.9M
    }
15313
15314
40.8M
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
15315
0
        PyErr_SetString(PyExc_TypeError,
15316
0
                        "not all arguments converted during string formatting");
15317
0
        return -1;
15318
0
    }
15319
40.8M
    return 0;
15320
40.8M
}
15321
15322
/* Apply printf-style formatting: `format` supplies the template and `args`
   the values, given as a tuple, a single object, or a mapping for
   "%(name)" keys.

   Return the result of _PyUnicodeWriter_Finish() on success; raise an
   exception and return NULL on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Format string cursor. */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
    ctx.fmtpos = 0;

    /* Output writer. */
    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    /* Argument source. */
    const int is_tuple = PyTuple_Check(args);
    if (is_tuple) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        /* A single non-tuple argument: with these values exactly one
           argument can be fetched. */
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    ctx.args = args;
    ctx.args_owned = 0;
    if (PyMapping_Check(args) && !is_tuple && !PyUnicode_Check(args)) {
        ctx.dict = args;
    }
    else {
        ctx.dict = NULL;
    }

    while (--ctx.fmtcnt >= 0) {
        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Conversion specification: step over the '%'. */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto onError;
            }
            continue;
        }

        /* Literal text: copy everything up to the next '%' in one go. */
        const Py_ssize_t literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The scan ran one position past the end of the string, and
               this literal is the last piece of output. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto onError;
        }
    }

    if (ctx.argidx < ctx.arglen && !ctx.dict) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15404
15405
static PyObject *
15406
unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15407
15408
/*[clinic input]
15409
@classmethod
15410
str.__new__ as unicode_new
15411
15412
    object as x: object = NULL
15413
    encoding: str = NULL
15414
    errors: str = NULL
15415
15416
[clinic start generated code]*/
15417
15418
static PyObject *
15419
unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15420
                 const char *errors)
15421
/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
15422
10.9M
{
15423
10.9M
    PyObject *unicode;
15424
10.9M
    if (x == NULL) {
15425
0
        unicode = unicode_get_empty();
15426
0
    }
15427
10.9M
    else if (encoding == NULL && errors == NULL) {
15428
10.9M
        unicode = PyObject_Str(x);
15429
10.9M
    }
15430
0
    else {
15431
0
        unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15432
0
    }
15433
15434
10.9M
    if (unicode != NULL && type != &PyUnicode_Type) {
15435
10.9M
        Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15436
10.9M
    }
15437
10.9M
    return unicode;
15438
10.9M
}
15439
15440
static const char *
15441
arg_as_utf8(PyObject *obj, const char *name)
15442
1.06M
{
15443
1.06M
    if (!PyUnicode_Check(obj)) {
15444
0
        PyErr_Format(PyExc_TypeError,
15445
0
                     "str() argument '%s' must be str, not %T",
15446
0
                     name, obj);
15447
0
        return NULL;
15448
0
    }
15449
1.06M
    return _PyUnicode_AsUTF8NoNUL(obj);
15450
1.06M
}
15451
15452
static PyObject *
15453
unicode_vectorcall(PyObject *type, PyObject *const *args,
15454
                   size_t nargsf, PyObject *kwnames)
15455
863k
{
15456
863k
    assert(Py_Is(_PyType_CAST(type), &PyUnicode_Type));
15457
15458
863k
    Py_ssize_t nargs = PyVectorcall_NARGS(nargsf);
15459
863k
    if (kwnames != NULL && PyTuple_GET_SIZE(kwnames) != 0) {
15460
        // Fallback to unicode_new()
15461
0
        PyObject *tuple = _PyTuple_FromArray(args, nargs);
15462
0
        if (tuple == NULL) {
15463
0
            return NULL;
15464
0
        }
15465
0
        PyObject *dict = _PyStack_AsDict(args + nargs, kwnames);
15466
0
        if (dict == NULL) {
15467
0
            Py_DECREF(tuple);
15468
0
            return NULL;
15469
0
        }
15470
0
        PyObject *ret = unicode_new(_PyType_CAST(type), tuple, dict);
15471
0
        Py_DECREF(tuple);
15472
0
        Py_DECREF(dict);
15473
0
        return ret;
15474
0
    }
15475
863k
    if (!_PyArg_CheckPositional("str", nargs, 0, 3)) {
15476
0
        return NULL;
15477
0
    }
15478
863k
    if (nargs == 0) {
15479
0
        return unicode_get_empty();
15480
0
    }
15481
863k
    PyObject *object = args[0];
15482
863k
    if (nargs == 1) {
15483
426
        return PyObject_Str(object);
15484
426
    }
15485
862k
    const char *encoding = arg_as_utf8(args[1], "encoding");
15486
862k
    if (encoding == NULL) {
15487
158
        return NULL;
15488
158
    }
15489
862k
    const char *errors = NULL;
15490
862k
    if (nargs == 3) {
15491
204k
        errors = arg_as_utf8(args[2], "errors");
15492
204k
        if (errors == NULL) {
15493
0
            return NULL;
15494
0
        }
15495
204k
    }
15496
862k
    return PyUnicode_FromEncodedObject(object, encoding, errors);
15497
862k
}
15498
15499
static PyObject *
15500
unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
15501
10.9M
{
15502
10.9M
    PyObject *self;
15503
10.9M
    Py_ssize_t length, char_size;
15504
10.9M
    int share_utf8;
15505
10.9M
    int kind;
15506
10.9M
    void *data;
15507
15508
10.9M
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
15509
10.9M
    assert(_PyUnicode_CHECK(unicode));
15510
15511
10.9M
    self = type->tp_alloc(type, 0);
15512
10.9M
    if (self == NULL) {
15513
0
        return NULL;
15514
0
    }
15515
10.9M
    kind = PyUnicode_KIND(unicode);
15516
10.9M
    length = PyUnicode_GET_LENGTH(unicode);
15517
15518
10.9M
    _PyUnicode_LENGTH(self) = length;
15519
#ifdef Py_DEBUG
15520
    _PyUnicode_HASH(self) = -1;
15521
#else
15522
10.9M
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15523
10.9M
#endif
15524
10.9M
    _PyUnicode_STATE(self).interned = 0;
15525
10.9M
    _PyUnicode_STATE(self).kind = kind;
15526
10.9M
    _PyUnicode_STATE(self).compact = 0;
15527
10.9M
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15528
10.9M
    _PyUnicode_STATE(self).statically_allocated = 0;
15529
10.9M
    PyUnicode_SET_UTF8_LENGTH(self, 0);
15530
10.9M
    PyUnicode_SET_UTF8(self, NULL);
15531
10.9M
    _PyUnicode_DATA_ANY(self) = NULL;
15532
15533
10.9M
    share_utf8 = 0;
15534
10.9M
    if (kind == PyUnicode_1BYTE_KIND) {
15535
9.76M
        char_size = 1;
15536
9.76M
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15537
9.72M
            share_utf8 = 1;
15538
9.76M
    }
15539
1.21M
    else if (kind == PyUnicode_2BYTE_KIND) {
15540
1.16M
        char_size = 2;
15541
1.16M
    }
15542
50.0k
    else {
15543
50.0k
        assert(kind == PyUnicode_4BYTE_KIND);
15544
50.0k
        char_size = 4;
15545
50.0k
    }
15546
15547
    /* Ensure we won't overflow the length. */
15548
10.9M
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15549
0
        PyErr_NoMemory();
15550
0
        goto onError;
15551
0
    }
15552
10.9M
    data = PyMem_Malloc((length + 1) * char_size);
15553
10.9M
    if (data == NULL) {
15554
0
        PyErr_NoMemory();
15555
0
        goto onError;
15556
0
    }
15557
15558
10.9M
    _PyUnicode_DATA_ANY(self) = data;
15559
10.9M
    if (share_utf8) {
15560
9.72M
        PyUnicode_SET_UTF8_LENGTH(self, length);
15561
9.72M
        PyUnicode_SET_UTF8(self, data);
15562
9.72M
    }
15563
15564
10.9M
    memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));
15565
10.9M
    assert(_PyUnicode_CheckConsistency(self, 1));
15566
#ifdef Py_DEBUG
15567
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15568
#endif
15569
10.9M
    return self;
15570
15571
0
onError:
15572
0
    Py_DECREF(self);
15573
0
    return NULL;
15574
10.9M
}
15575
15576
void
15577
_PyUnicode_ExactDealloc(PyObject *op)
15578
76.2M
{
15579
76.2M
    assert(PyUnicode_CheckExact(op));
15580
76.2M
    unicode_dealloc(op);
15581
76.2M
}
15582
15583
PyDoc_STRVAR(unicode_doc,
15584
"str(object='') -> str\n\
15585
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15586
\n\
15587
Create a new string object from the given object. If encoding or\n\
15588
errors is specified, then the object must expose a data buffer\n\
15589
that will be decoded using the given encoding and error handler.\n\
15590
Otherwise, returns the result of object.__str__() (if defined)\n\
15591
or repr(object).\n\
15592
encoding defaults to 'utf-8'.\n\
15593
errors defaults to 'strict'.");
15594
15595
static PyObject *unicode_iter(PyObject *seq);
15596
15597
PyTypeObject PyUnicode_Type = {
15598
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15599
    "str",                        /* tp_name */
15600
    sizeof(PyUnicodeObject),      /* tp_basicsize */
15601
    0,                            /* tp_itemsize */
15602
    /* Slots */
15603
    unicode_dealloc,              /* tp_dealloc */
15604
    0,                            /* tp_vectorcall_offset */
15605
    0,                            /* tp_getattr */
15606
    0,                            /* tp_setattr */
15607
    0,                            /* tp_as_async */
15608
    unicode_repr,                 /* tp_repr */
15609
    &unicode_as_number,           /* tp_as_number */
15610
    &unicode_as_sequence,         /* tp_as_sequence */
15611
    &unicode_as_mapping,          /* tp_as_mapping */
15612
    unicode_hash,                 /* tp_hash*/
15613
    0,                            /* tp_call*/
15614
    unicode_str,                  /* tp_str */
15615
    PyObject_GenericGetAttr,      /* tp_getattro */
15616
    0,                            /* tp_setattro */
15617
    0,                            /* tp_as_buffer */
15618
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15619
        Py_TPFLAGS_UNICODE_SUBCLASS |
15620
        _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
15621
    unicode_doc,                  /* tp_doc */
15622
    0,                            /* tp_traverse */
15623
    0,                            /* tp_clear */
15624
    PyUnicode_RichCompare,        /* tp_richcompare */
15625
    0,                            /* tp_weaklistoffset */
15626
    unicode_iter,                 /* tp_iter */
15627
    0,                            /* tp_iternext */
15628
    unicode_methods,              /* tp_methods */
15629
    0,                            /* tp_members */
15630
    0,                            /* tp_getset */
15631
    0,                            /* tp_base */
15632
    0,                            /* tp_dict */
15633
    0,                            /* tp_descr_get */
15634
    0,                            /* tp_descr_set */
15635
    0,                            /* tp_dictoffset */
15636
    0,                            /* tp_init */
15637
    0,                            /* tp_alloc */
15638
    unicode_new,                  /* tp_new */
15639
    PyObject_Free,                /* tp_free */
15640
    .tp_vectorcall = unicode_vectorcall,
15641
};
15642
15643
/* Initialize the Unicode implementation */
15644
15645
static void
15646
_init_global_state(void)
15647
16
{
15648
16
    static int initialized = 0;
15649
16
    if (initialized) {
15650
0
        return;
15651
0
    }
15652
16
    initialized = 1;
15653
15654
    /* initialize the linebreak bloom filter */
15655
16
    const Py_UCS2 linebreak[] = {
15656
16
        0x000A, /* LINE FEED */
15657
16
        0x000D, /* CARRIAGE RETURN */
15658
16
        0x001C, /* FILE SEPARATOR */
15659
16
        0x001D, /* GROUP SEPARATOR */
15660
16
        0x001E, /* RECORD SEPARATOR */
15661
16
        0x0085, /* NEXT LINE */
15662
16
        0x2028, /* LINE SEPARATOR */
15663
16
        0x2029, /* PARAGRAPH SEPARATOR */
15664
16
    };
15665
16
    bloom_linebreak = make_bloom_mask(
15666
16
        PyUnicode_2BYTE_KIND, linebreak,
15667
16
        Py_ARRAY_LENGTH(linebreak));
15668
16
}
15669
15670
void
15671
_PyUnicode_InitState(PyInterpreterState *interp)
15672
16
{
15673
16
    if (!_Py_IsMainInterpreter(interp)) {
15674
0
        return;
15675
0
    }
15676
16
    _init_global_state();
15677
16
}
15678
15679
15680
PyStatus
15681
_PyUnicode_InitGlobalObjects(PyInterpreterState *interp)
15682
16
{
15683
16
    if (_Py_IsMainInterpreter(interp)) {
15684
16
        PyStatus status = init_global_interned_strings(interp);
15685
16
        if (_PyStatus_EXCEPTION(status)) {
15686
0
            return status;
15687
0
        }
15688
16
    }
15689
16
    assert(INTERNED_STRINGS);
15690
15691
16
    if (init_interned_dict(interp)) {
15692
0
        PyErr_Clear();
15693
0
        return _PyStatus_ERR("failed to create interned dict");
15694
0
    }
15695
15696
16
    return _PyStatus_OK();
15697
16
}
15698
15699
15700
PyStatus
15701
_PyUnicode_InitTypes(PyInterpreterState *interp)
15702
16
{
15703
16
    if (_PyStaticType_InitBuiltin(interp, &EncodingMapType) < 0) {
15704
0
        goto error;
15705
0
    }
15706
16
    if (_PyStaticType_InitBuiltin(interp, &PyFieldNameIter_Type) < 0) {
15707
0
        goto error;
15708
0
    }
15709
16
    if (_PyStaticType_InitBuiltin(interp, &PyFormatterIter_Type) < 0) {
15710
0
        goto error;
15711
0
    }
15712
16
    return _PyStatus_OK();
15713
15714
0
error:
15715
0
    return _PyStatus_ERR("Can't initialize unicode types");
15716
16
}
15717
15718
static /* non-null */ PyObject*
15719
intern_static(PyInterpreterState *interp, PyObject *s /* stolen */)
15720
17.1k
{
15721
    // Note that this steals a reference to `s`, but in many cases that
15722
    // stolen ref is returned, requiring no decref/incref.
15723
15724
17.1k
    assert(s != NULL);
15725
17.1k
    assert(_PyUnicode_CHECK(s));
15726
17.1k
    assert(_PyUnicode_STATE(s).statically_allocated);
15727
17.1k
    assert(!PyUnicode_CHECK_INTERNED(s));
15728
15729
#ifdef Py_DEBUG
15730
    /* We must not add process-global interned string if there's already a
15731
     * per-interpreter interned_dict, which might contain duplicates.
15732
     */
15733
    PyObject *interned = get_interned_dict(interp);
15734
    assert(interned == NULL);
15735
#endif
15736
15737
    /* Look in the global cache first. */
15738
17.1k
    PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15739
    /* We should only init each string once */
15740
17.1k
    assert(r == NULL);
15741
    /* but just in case (for the non-debug build), handle this */
15742
17.1k
    if (r != NULL && r != s) {
15743
0
        assert(_PyUnicode_STATE(r).interned == SSTATE_INTERNED_IMMORTAL_STATIC);
15744
0
        assert(_PyUnicode_CHECK(r));
15745
0
        Py_DECREF(s);
15746
0
        return Py_NewRef(r);
15747
0
    }
15748
15749
17.1k
    if (_Py_hashtable_set(INTERNED_STRINGS, s, s) < -1) {
15750
0
        Py_FatalError("failed to intern static string");
15751
0
    }
15752
15753
17.1k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_IMMORTAL_STATIC;
15754
17.1k
    return s;
15755
17.1k
}
15756
15757
void
15758
_PyUnicode_InternStatic(PyInterpreterState *interp, PyObject **p)
15759
17.1k
{
15760
    // This should only be called as part of runtime initialization
15761
17.1k
    assert(!Py_IsInitialized());
15762
15763
17.1k
    *p = intern_static(interp, *p);
15764
17.1k
    assert(*p);
15765
17.1k
}
15766
15767
static void
15768
immortalize_interned(PyObject *s)
15769
100k
{
15770
100k
    assert(PyUnicode_CHECK_INTERNED(s) == SSTATE_INTERNED_MORTAL);
15771
100k
    assert(!_Py_IsImmortal(s));
15772
#ifdef Py_REF_DEBUG
15773
    /* The reference count value should be excluded from the RefTotal.
15774
       The decrements to these objects will not be registered so they
15775
       need to be accounted for in here. */
15776
    for (Py_ssize_t i = 0; i < Py_REFCNT(s); i++) {
15777
        _Py_DecRefTotal(_PyThreadState_GET());
15778
    }
15779
#endif
15780
100k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_IMMORTAL);
15781
100k
    _Py_SetImmortal(s);
15782
100k
}
15783
15784
static /* non-null */ PyObject*
15785
intern_common(PyInterpreterState *interp, PyObject *s /* stolen */,
15786
              bool immortalize)
15787
36.0M
{
15788
    // Note that this steals a reference to `s`, but in many cases that
15789
    // stolen ref is returned, requiring no decref/incref.
15790
15791
#ifdef Py_DEBUG
15792
    assert(s != NULL);
15793
    assert(_PyUnicode_CHECK(s));
15794
#else
15795
36.0M
    if (s == NULL || !PyUnicode_Check(s)) {
15796
0
        return s;
15797
0
    }
15798
36.0M
#endif
15799
15800
    /* If it's a subclass, we don't really know what putting
15801
       it in the interned dict might do. */
15802
36.0M
    if (!PyUnicode_CheckExact(s)) {
15803
0
        return s;
15804
0
    }
15805
15806
    /* Is it already interned? */
15807
36.0M
    switch (PyUnicode_CHECK_INTERNED(s)) {
15808
3.34M
        case SSTATE_NOT_INTERNED:
15809
            // no, go on
15810
3.34M
            break;
15811
19.5k
        case SSTATE_INTERNED_MORTAL:
15812
            // yes but we might need to make it immortal
15813
19.5k
            if (immortalize) {
15814
5.54k
                immortalize_interned(s);
15815
5.54k
            }
15816
19.5k
            return s;
15817
32.6M
        default:
15818
            // all done
15819
32.6M
            return s;
15820
36.0M
    }
15821
15822
    /* Statically allocated strings must be already interned. */
15823
36.0M
    assert(!_PyUnicode_STATE(s).statically_allocated);
15824
15825
#if Py_GIL_DISABLED
15826
    /* In the free-threaded build, all interned strings are immortal */
15827
    immortalize = 1;
15828
#endif
15829
15830
    /* If it's already immortal, intern it as such */
15831
3.34M
    if (_Py_IsImmortal(s)) {
15832
0
        immortalize = 1;
15833
0
    }
15834
15835
    /* if it's a short string, get the singleton */
15836
3.34M
    if (PyUnicode_GET_LENGTH(s) == 1 &&
15837
23.5k
                PyUnicode_KIND(s) == PyUnicode_1BYTE_KIND) {
15838
0
        PyObject *r = LATIN1(*(unsigned char*)PyUnicode_DATA(s));
15839
0
        assert(PyUnicode_CHECK_INTERNED(r));
15840
0
        Py_DECREF(s);
15841
0
        return r;
15842
0
    }
15843
#ifdef Py_DEBUG
15844
    assert(!unicode_is_singleton(s));
15845
#endif
15846
15847
    /* Look in the global cache now. */
15848
3.34M
    {
15849
3.34M
        PyObject *r = (PyObject *)_Py_hashtable_get(INTERNED_STRINGS, s);
15850
3.34M
        if (r != NULL) {
15851
288k
            assert(_PyUnicode_STATE(r).statically_allocated);
15852
288k
            assert(r != s);  // r must be statically_allocated; s is not
15853
288k
            Py_DECREF(s);
15854
288k
            return Py_NewRef(r);
15855
288k
        }
15856
3.34M
    }
15857
15858
    /* Do a setdefault on the per-interpreter cache. */
15859
3.06M
    PyObject *interned = get_interned_dict(interp);
15860
3.06M
    assert(interned != NULL);
15861
#ifdef Py_GIL_DISABLED
15862
#  define INTERN_MUTEX &_Py_INTERP_CACHED_OBJECT(interp, interned_mutex)
15863
#endif
15864
3.06M
    FT_MUTEX_LOCK(INTERN_MUTEX);
15865
3.06M
    PyObject *t;
15866
3.06M
    {
15867
3.06M
        int res = PyDict_SetDefaultRef(interned, s, s, &t);
15868
3.06M
        if (res < 0) {
15869
0
            PyErr_Clear();
15870
0
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
15871
0
            return s;
15872
0
        }
15873
3.06M
        else if (res == 1) {
15874
            // value was already present (not inserted)
15875
2.29M
            Py_DECREF(s);
15876
2.29M
            if (immortalize &&
15877
666k
                    PyUnicode_CHECK_INTERNED(t) == SSTATE_INTERNED_MORTAL) {
15878
4.19k
                immortalize_interned(t);
15879
4.19k
            }
15880
2.29M
            FT_MUTEX_UNLOCK(INTERN_MUTEX);
15881
2.29M
            return t;
15882
2.29M
        }
15883
767k
        else {
15884
            // value was newly inserted
15885
767k
            assert (s == t);
15886
767k
            Py_DECREF(t);
15887
767k
        }
15888
3.06M
    }
15889
15890
    /* NOT_INTERNED -> INTERNED_MORTAL */
15891
15892
3.06M
    assert(_PyUnicode_STATE(s).interned == SSTATE_NOT_INTERNED);
15893
15894
767k
    if (!_Py_IsImmortal(s)) {
15895
        /* The two references in interned dict (key and value) are not counted.
15896
        unicode_dealloc() and _PyUnicode_ClearInterned() take care of this. */
15897
767k
        Py_DECREF(s);
15898
767k
        Py_DECREF(s);
15899
767k
    }
15900
767k
    FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_INTERNED_MORTAL);
15901
15902
    /* INTERNED_MORTAL -> INTERNED_IMMORTAL (if needed) */
15903
15904
#ifdef Py_DEBUG
15905
    if (_Py_IsImmortal(s)) {
15906
        assert(immortalize);
15907
    }
15908
#endif
15909
767k
    if (immortalize) {
15910
90.4k
        immortalize_interned(s);
15911
90.4k
    }
15912
15913
767k
    FT_MUTEX_UNLOCK(INTERN_MUTEX);
15914
767k
    return s;
15915
3.06M
}
15916
15917
void
15918
_PyUnicode_InternImmortal(PyInterpreterState *interp, PyObject **p)
15919
2.92M
{
15920
2.92M
    *p = intern_common(interp, *p, 1);
15921
2.92M
    assert(*p);
15922
2.92M
}
15923
15924
void
15925
_PyUnicode_InternMortal(PyInterpreterState *interp, PyObject **p)
15926
33.1M
{
15927
33.1M
    *p = intern_common(interp, *p, 0);
15928
33.1M
    assert(*p);
15929
33.1M
}
15930
15931
15932
void
15933
_PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p)
15934
0
{
15935
0
    _PyUnicode_InternImmortal(interp, p);
15936
0
    return;
15937
0
}
15938
15939
void
15940
PyUnicode_InternInPlace(PyObject **p)
15941
0
{
15942
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
15943
0
    _PyUnicode_InternMortal(interp, p);
15944
0
}
15945
15946
// Public-looking name kept for the stable ABI; user should not call this:
15947
PyAPI_FUNC(void) PyUnicode_InternImmortal(PyObject **);
15948
void
15949
PyUnicode_InternImmortal(PyObject **p)
15950
0
{
15951
0
    PyInterpreterState *interp = _PyInterpreterState_GET();
15952
0
    _PyUnicode_InternImmortal(interp, p);
15953
0
}
15954
15955
PyObject *
15956
PyUnicode_InternFromString(const char *cp)
15957
1.22M
{
15958
1.22M
    PyObject *s = PyUnicode_FromString(cp);
15959
1.22M
    if (s == NULL) {
15960
0
        return NULL;
15961
0
    }
15962
1.22M
    PyInterpreterState *interp = _PyInterpreterState_GET();
15963
1.22M
    _PyUnicode_InternMortal(interp, &s);
15964
1.22M
    return s;
15965
1.22M
}
15966
15967
15968
void
15969
_PyUnicode_ClearInterned(PyInterpreterState *interp)
15970
0
{
15971
0
    PyObject *interned = get_interned_dict(interp);
15972
0
    if (interned == NULL) {
15973
0
        return;
15974
0
    }
15975
0
    assert(PyDict_CheckExact(interned));
15976
15977
0
    if (has_shared_intern_dict(interp)) {
15978
        // the dict doesn't belong to this interpreter, skip the debug
15979
        // checks on it and just clear the pointer to it
15980
0
        clear_interned_dict(interp);
15981
0
        return;
15982
0
    }
15983
15984
#ifdef INTERNED_STATS
15985
    fprintf(stderr, "releasing %zd interned strings\n",
15986
            PyDict_GET_SIZE(interned));
15987
15988
    Py_ssize_t total_length = 0;
15989
#endif
15990
0
    Py_ssize_t pos = 0;
15991
0
    PyObject *s, *ignored_value;
15992
0
    while (PyDict_Next(interned, &pos, &s, &ignored_value)) {
15993
0
        int shared = 0;
15994
0
        switch (PyUnicode_CHECK_INTERNED(s)) {
15995
0
        case SSTATE_INTERNED_IMMORTAL:
15996
            /* Make immortal interned strings mortal again. */
15997
            // Skip the Immortal Instance check and restore
15998
            // the two references (key and value) ignored
15999
            // by PyUnicode_InternInPlace().
16000
0
            _Py_SetMortal(s, 2);
16001
#ifdef Py_REF_DEBUG
16002
            /* let's be pedantic with the ref total */
16003
            _Py_IncRefTotal(_PyThreadState_GET());
16004
            _Py_IncRefTotal(_PyThreadState_GET());
16005
#endif
16006
#ifdef INTERNED_STATS
16007
            total_length += PyUnicode_GET_LENGTH(s);
16008
#endif
16009
0
            break;
16010
0
        case SSTATE_INTERNED_IMMORTAL_STATIC:
16011
            /* It is shared between interpreters, so we should unmark it
16012
               only when this is the last interpreter in which it's
16013
               interned.  We immortalize all the statically initialized
16014
               strings during startup, so we can rely on the
16015
               main interpreter to be the last one. */
16016
0
            if (!_Py_IsMainInterpreter(interp)) {
16017
0
                shared = 1;
16018
0
            }
16019
0
            break;
16020
0
        case SSTATE_INTERNED_MORTAL:
16021
            // Restore 2 references held by the interned dict; these will
16022
            // be decref'd by clear_interned_dict's PyDict_Clear.
16023
0
            _Py_RefcntAdd(s, 2);
16024
#ifdef Py_REF_DEBUG
16025
            /* let's be pedantic with the ref total */
16026
            _Py_IncRefTotal(_PyThreadState_GET());
16027
            _Py_IncRefTotal(_PyThreadState_GET());
16028
#endif
16029
0
            break;
16030
0
        case SSTATE_NOT_INTERNED:
16031
0
            _Py_FALLTHROUGH;
16032
0
        default:
16033
0
            Py_UNREACHABLE();
16034
0
        }
16035
0
        if (!shared) {
16036
0
            FT_ATOMIC_STORE_UINT8_RELAXED(_PyUnicode_STATE(s).interned, SSTATE_NOT_INTERNED);
16037
0
        }
16038
0
    }
16039
#ifdef INTERNED_STATS
16040
    fprintf(stderr,
16041
            "total length of all interned strings: %zd characters\n",
16042
            total_length);
16043
#endif
16044
16045
0
    struct _Py_unicode_state *state = &interp->unicode;
16046
0
    struct _Py_unicode_ids *ids = &state->ids;
16047
0
    for (Py_ssize_t i=0; i < ids->size; i++) {
16048
0
        Py_XINCREF(ids->array[i]);
16049
0
    }
16050
0
    clear_interned_dict(interp);
16051
0
    if (_Py_IsMainInterpreter(interp)) {
16052
0
        clear_global_interned_strings();
16053
0
    }
16054
0
}
16055
16056
16057
/********************* Unicode Iterator **************************/
16058
16059
typedef struct {
16060
    PyObject_HEAD
16061
    Py_ssize_t it_index;
16062
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
16063
} unicodeiterobject;
16064
16065
static void
16066
unicodeiter_dealloc(PyObject *op)
16067
1.81M
{
16068
1.81M
    unicodeiterobject *it = (unicodeiterobject *)op;
16069
1.81M
    _PyObject_GC_UNTRACK(it);
16070
1.81M
    Py_XDECREF(it->it_seq);
16071
1.81M
    PyObject_GC_Del(it);
16072
1.81M
}
16073
16074
static int
16075
unicodeiter_traverse(PyObject *op, visitproc visit, void *arg)
16076
12
{
16077
12
    unicodeiterobject *it = (unicodeiterobject *)op;
16078
12
    Py_VISIT(it->it_seq);
16079
12
    return 0;
16080
12
}
16081
16082
static PyObject *
unicodeiter_next(PyObject *op)
{
    /* tp_iternext of str_iterator: return the next character as a str, or
       NULL (without setting an exception here) once exhausted. */
    unicodeiterobject *it = (unicodeiterobject *)op;

    assert(it != NULL);
    PyObject *str = it->it_seq;
    if (str == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));

    if (it->it_index >= PyUnicode_GET_LENGTH(str)) {
        /* Exhausted: release the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    const int kind = PyUnicode_KIND(str);
    const void *data = PyUnicode_DATA(str);
    const Py_UCS4 ch = PyUnicode_READ(kind, data, it->it_index);
    it->it_index++;
    return unicode_char(ch);
}
16106
16107
static PyObject *
unicode_ascii_iter_next(PyObject *op)
{
    /* tp_iternext of str_ascii_iterator: the string is compact ASCII
       (asserted), so each character is returned as one of the preallocated
       single-character ASCII string singletons. */
    unicodeiterobject *it = (unicodeiterobject *)op;
    assert(it != NULL);

    PyObject *str = it->it_seq;
    if (str == NULL) {
        return NULL;
    }
    assert(_PyUnicode_CHECK(str));
    assert(PyUnicode_IS_COMPACT_ASCII(str));

    if (it->it_index >= PyUnicode_GET_LENGTH(str)) {
        /* Exhausted: release the string so later calls return at once. */
        it->it_seq = NULL;
        Py_DECREF(str);
        return NULL;
    }

    /* The characters are read from directly after the PyASCIIObject
       header. */
    const void *data = ((void*)(_PyASCIIObject_CAST(str) + 1));
    const Py_UCS1 ch = (Py_UCS1)PyUnicode_READ(PyUnicode_1BYTE_KIND,
                                               data, it->it_index);
    it->it_index++;
    return (PyObject*)&_Py_SINGLETON(strings).ascii[ch];
}
16129
16130
static PyObject *
unicodeiter_len(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    /* __length_hint__: number of characters left, or 0 once the iterator
       has released its string. */
    unicodeiterobject *it = (unicodeiterobject *)op;

    const Py_ssize_t remaining =
        (it->it_seq != NULL)
            ? PyUnicode_GET_LENGTH(it->it_seq) - it->it_index
            : 0;
    return PyLong_FromSsize_t(remaining);
}
16139
16140
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16141
16142
static PyObject *
unicodeiter_reduce(PyObject *op, PyObject *Py_UNUSED(ignored))
{
    /* __reduce__: return (iter, (string,), index) for a live iterator, or
       (iter, ('',)) for an exhausted one. */
    unicodeiterobject *it = (unicodeiterobject *)op;

    /* _PyEval_GetBuiltin can invoke arbitrary code,
     * call must be before access of iterator pointers.
     * see issue #101765 */
    PyObject *iter_builtin = _PyEval_GetBuiltin(&_Py_ID(iter));

    if (it->it_seq == NULL) {
        PyObject *empty = unicode_get_empty();
        if (empty == NULL) {
            Py_XDECREF(iter_builtin);
            return NULL;
        }
        return Py_BuildValue("N(N)", iter_builtin, empty);
    }

    return Py_BuildValue("N(O)n", iter_builtin, it->it_seq, it->it_index);
}
16163
16164
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16165
16166
static PyObject *
unicodeiter_setstate(PyObject *op, PyObject *state)
{
    /* __setstate__: restore the position from the integer `state`, clamped
       to [0, len(string)].  Has no effect on an exhausted iterator.
       Returns None, or NULL if `state` cannot be converted. */
    unicodeiterobject *it = (unicodeiterobject *)op;

    Py_ssize_t index = PyLong_AsSsize_t(state);
    if (index == -1 && PyErr_Occurred()) {
        return NULL;
    }

    if (it->it_seq != NULL) {
        if (index < 0) {
            index = 0;
        }
        else if (index > PyUnicode_GET_LENGTH(it->it_seq)) {
            /* iterator truncated */
            index = PyUnicode_GET_LENGTH(it->it_seq);
        }
        it->it_index = index;
    }
    Py_RETURN_NONE;
}
16182
16183
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16184
16185
static PyMethodDef unicodeiter_methods[] = {
16186
    {"__length_hint__", unicodeiter_len, METH_NOARGS, length_hint_doc},
16187
    {"__reduce__",      unicodeiter_reduce, METH_NOARGS, reduce_doc},
16188
    {"__setstate__",    unicodeiter_setstate, METH_O, setstate_doc},
16189
    {NULL,      NULL}       /* sentinel */
16190
};
16191
16192
PyTypeObject PyUnicodeIter_Type = {
16193
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16194
    "str_iterator",         /* tp_name */
16195
    sizeof(unicodeiterobject),      /* tp_basicsize */
16196
    0,                  /* tp_itemsize */
16197
    /* methods */
16198
    unicodeiter_dealloc,/* tp_dealloc */
16199
    0,                  /* tp_vectorcall_offset */
16200
    0,                  /* tp_getattr */
16201
    0,                  /* tp_setattr */
16202
    0,                  /* tp_as_async */
16203
    0,                  /* tp_repr */
16204
    0,                  /* tp_as_number */
16205
    0,                  /* tp_as_sequence */
16206
    0,                  /* tp_as_mapping */
16207
    0,                  /* tp_hash */
16208
    0,                  /* tp_call */
16209
    0,                  /* tp_str */
16210
    PyObject_GenericGetAttr,        /* tp_getattro */
16211
    0,                  /* tp_setattro */
16212
    0,                  /* tp_as_buffer */
16213
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16214
    0,                  /* tp_doc */
16215
    unicodeiter_traverse, /* tp_traverse */
16216
    0,                  /* tp_clear */
16217
    0,                  /* tp_richcompare */
16218
    0,                  /* tp_weaklistoffset */
16219
    PyObject_SelfIter,          /* tp_iter */
16220
    unicodeiter_next,   /* tp_iternext */
16221
    unicodeiter_methods,            /* tp_methods */
16222
    0,
16223
};
16224
16225
PyTypeObject _PyUnicodeASCIIIter_Type = {
16226
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
16227
    .tp_name = "str_ascii_iterator",
16228
    .tp_basicsize = sizeof(unicodeiterobject),
16229
    .tp_dealloc = unicodeiter_dealloc,
16230
    .tp_getattro = PyObject_GenericGetAttr,
16231
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,
16232
    .tp_traverse = unicodeiter_traverse,
16233
    .tp_iter = PyObject_SelfIter,
16234
    .tp_iternext = unicode_ascii_iter_next,
16235
    .tp_methods = unicodeiter_methods,
16236
};
16237
16238
static PyObject *
unicode_iter(PyObject *seq)
{
    /* tp_iter of str: create an iterator positioned at index 0.  Compact
       ASCII strings get the specialized str_ascii_iterator type.
       Returns a new reference, or NULL with an exception set. */
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }

    PyTypeObject *iter_type = PyUnicode_IS_COMPACT_ASCII(seq)
                                  ? &_PyUnicodeASCIIIter_Type
                                  : &PyUnicodeIter_Type;

    unicodeiterobject *it = PyObject_GC_New(unicodeiterobject, iter_type);
    if (it == NULL) {
        return NULL;
    }

    it->it_index = 0;
    it->it_seq = Py_NewRef(seq);
    _PyObject_GC_TRACK(it);
    return (PyObject *)it;
}
16260
16261
static int
16262
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
16263
64
{
16264
64
    int res;
16265
64
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16266
64
    if (res == -2) {
16267
0
        PyErr_Format(PyExc_RuntimeError, "cannot encode %s", name);
16268
0
        return -1;
16269
0
    }
16270
64
    if (res < 0) {
16271
0
        PyErr_NoMemory();
16272
0
        return -1;
16273
0
    }
16274
64
    return 0;
16275
64
}
16276
16277
16278
static int
16279
config_get_codec_name(wchar_t **config_encoding)
16280
32
{
16281
32
    char *encoding;
16282
32
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16283
0
        return -1;
16284
0
    }
16285
16286
32
    PyObject *name_obj = NULL;
16287
32
    PyObject *codec = _PyCodec_Lookup(encoding);
16288
32
    PyMem_RawFree(encoding);
16289
16290
32
    if (!codec)
16291
0
        goto error;
16292
16293
32
    name_obj = PyObject_GetAttrString(codec, "name");
16294
32
    Py_CLEAR(codec);
16295
32
    if (!name_obj) {
16296
0
        goto error;
16297
0
    }
16298
16299
32
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16300
32
    Py_DECREF(name_obj);
16301
32
    if (wname == NULL) {
16302
0
        goto error;
16303
0
    }
16304
16305
32
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16306
32
    if (raw_wname == NULL) {
16307
0
        PyMem_Free(wname);
16308
0
        PyErr_NoMemory();
16309
0
        goto error;
16310
0
    }
16311
16312
32
    PyMem_RawFree(*config_encoding);
16313
32
    *config_encoding = raw_wname;
16314
16315
32
    PyMem_Free(wname);
16316
32
    return 0;
16317
16318
0
error:
16319
0
    Py_XDECREF(codec);
16320
0
    Py_XDECREF(name_obj);
16321
0
    return -1;
16322
32
}
16323
16324
16325
static PyStatus
16326
init_stdio_encoding(PyInterpreterState *interp)
16327
16
{
16328
    /* Update the stdio encoding to the normalized Python codec name. */
16329
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16330
16
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
16331
0
        return _PyStatus_ERR("failed to get the Python codec name "
16332
0
                             "of the stdio encoding");
16333
0
    }
16334
16
    return _PyStatus_OK();
16335
16
}
16336
16337
16338
static int
16339
init_fs_codec(PyInterpreterState *interp)
16340
16
{
16341
16
    const PyConfig *config = _PyInterpreterState_GetConfig(interp);
16342
16343
16
    _Py_error_handler error_handler;
16344
16
    error_handler = get_error_handler_wide(config->filesystem_errors);
16345
16
    if (error_handler == _Py_ERROR_UNKNOWN) {
16346
0
        PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
16347
0
        return -1;
16348
0
    }
16349
16350
16
    char *encoding, *errors;
16351
16
    if (encode_wstr_utf8(config->filesystem_encoding,
16352
16
                         &encoding,
16353
16
                         "filesystem_encoding") < 0) {
16354
0
        return -1;
16355
0
    }
16356
16357
16
    if (encode_wstr_utf8(config->filesystem_errors,
16358
16
                         &errors,
16359
16
                         "filesystem_errors") < 0) {
16360
0
        PyMem_RawFree(encoding);
16361
0
        return -1;
16362
0
    }
16363
16364
16
    struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16365
16
    PyMem_RawFree(fs_codec->encoding);
16366
16
    fs_codec->encoding = encoding;
16367
    /* encoding has been normalized by init_fs_encoding() */
16368
16
    fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16369
16
    PyMem_RawFree(fs_codec->errors);
16370
16
    fs_codec->errors = errors;
16371
16
    fs_codec->error_handler = error_handler;
16372
16373
#ifdef _Py_FORCE_UTF8_FS_ENCODING
16374
    assert(fs_codec->utf8 == 1);
16375
#endif
16376
16377
    /* At this point, PyUnicode_EncodeFSDefault() and
16378
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16379
       the C implementation of the filesystem encoding. */
16380
16381
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16382
       global configuration variables. */
16383
16
    if (_Py_IsMainInterpreter(interp)) {
16384
16385
16
        if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16386
16
                                      fs_codec->errors) < 0) {
16387
0
            PyErr_NoMemory();
16388
0
            return -1;
16389
0
        }
16390
16
    }
16391
16
    return 0;
16392
16
}
16393
16394
16395
static PyStatus
16396
init_fs_encoding(PyThreadState *tstate)
16397
16
{
16398
16
    PyInterpreterState *interp = tstate->interp;
16399
16400
    /* Update the filesystem encoding to the normalized Python codec name.
16401
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16402
       (Python codec name). */
16403
16
    PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
16404
16
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
16405
0
        _Py_DumpPathConfig(tstate);
16406
0
        return _PyStatus_ERR("failed to get the Python codec "
16407
0
                             "of the filesystem encoding");
16408
0
    }
16409
16410
16
    if (init_fs_codec(interp) < 0) {
16411
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
16412
0
    }
16413
16
    return _PyStatus_OK();
16414
16
}
16415
16416
16417
PyStatus
16418
_PyUnicode_InitEncodings(PyThreadState *tstate)
16419
16
{
16420
16
    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
16421
16
    if (_PyStatus_EXCEPTION(status)) {
16422
0
        return status;
16423
0
    }
16424
16
    status = init_fs_encoding(tstate);
16425
16
    if (_PyStatus_EXCEPTION(status)) {
16426
0
        return status;
16427
0
    }
16428
16429
16
    return init_stdio_encoding(tstate->interp);
16430
16
}
16431
16432
16433
static void
16434
_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
16435
0
{
16436
0
    PyMem_RawFree(fs_codec->encoding);
16437
0
    fs_codec->encoding = NULL;
16438
0
    fs_codec->utf8 = 0;
16439
0
    PyMem_RawFree(fs_codec->errors);
16440
0
    fs_codec->errors = NULL;
16441
0
    fs_codec->error_handler = _Py_ERROR_UNKNOWN;
16442
0
}
16443
16444
16445
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (in which case the
   configuration is left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *cfg = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacement strings up front so the configuration is
       only modified once both allocations have succeeded. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    /* Swap in the new values, releasing the previous ones. */
    PyMem_RawFree(cfg->filesystem_encoding);
    cfg->filesystem_encoding = new_encoding;

    PyMem_RawFree(cfg->filesystem_errors);
    cfg->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16470
16471
16472
#ifdef Py_DEBUG
/* Debug-build helper: returns non-zero when the main interpreter no longer
   has an interned-strings dict (get_interned_dict() returns NULL). */
static inline int
unicode_is_finalizing(void)
{
    PyInterpreterState *main_interp = _PyInterpreterState_Main();
    return get_interned_dict(main_interp) == NULL;
}
#endif
16479
16480
16481
void
16482
_PyUnicode_FiniTypes(PyInterpreterState *interp)
16483
0
{
16484
0
    _PyStaticType_FiniBuiltin(interp, &EncodingMapType);
16485
0
    _PyStaticType_FiniBuiltin(interp, &PyFieldNameIter_Type);
16486
0
    _PyStaticType_FiniBuiltin(interp, &PyFormatterIter_Type);
16487
0
}
16488
16489
16490
void
16491
_PyUnicode_Fini(PyInterpreterState *interp)
16492
0
{
16493
0
    struct _Py_unicode_state *state = &interp->unicode;
16494
16495
0
    if (!has_shared_intern_dict(interp)) {
16496
        // _PyUnicode_ClearInterned() must be called before _PyUnicode_Fini()
16497
0
        assert(get_interned_dict(interp) == NULL);
16498
0
    }
16499
16500
0
    _PyUnicode_FiniEncodings(&state->fs_codec);
16501
16502
    // bpo-47182: force a unicodedata CAPI capsule re-import on
16503
    // subsequent initialization of interpreter.
16504
0
    interp->unicode.ucnhash_capi = NULL;
16505
16506
0
    unicode_clear_identifiers(state);
16507
0
}
16508
16509
/* A _string module, to export formatter_parser and formatter_field_name_split
16510
   to the string.Formatter class implemented in Python. */
16511
16512
static PyMethodDef _string_methods[] = {
16513
    {"formatter_field_name_split", formatter_field_name_split,
16514
     METH_O, PyDoc_STR("split the argument as a field name")},
16515
    {"formatter_parser", formatter_parser,
16516
     METH_O, PyDoc_STR("parse the argument as a format string")},
16517
    {NULL, NULL}
16518
};
16519
16520
static PyModuleDef_Slot module_slots[] = {
16521
    {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
16522
    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
16523
    {0, NULL}
16524
};
16525
16526
static struct PyModuleDef _string_module = {
16527
    PyModuleDef_HEAD_INIT,
16528
    .m_name = "_string",
16529
    .m_doc = PyDoc_STR("string helper module"),
16530
    .m_size = 0,
16531
    .m_methods = _string_methods,
16532
    .m_slots = module_slots,
16533
};
16534
16535
PyMODINIT_FUNC
16536
PyInit__string(void)
16537
6
{
16538
6
    return PyModuleDef_Init(&_string_module);
16539
6
}
16540
16541
16542
#undef PyUnicode_KIND
16543
int PyUnicode_KIND(PyObject *op)
16544
0
{
16545
0
    if (!PyUnicode_Check(op)) {
16546
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16547
0
        return -1;
16548
0
    }
16549
0
    return _PyASCIIObject_CAST(op)->state.kind;
16550
0
}
16551
16552
#undef PyUnicode_DATA
16553
void* PyUnicode_DATA(PyObject *op)
16554
0
{
16555
0
    if (!PyUnicode_Check(op)) {
16556
0
        PyErr_Format(PyExc_TypeError, "expect str, got %T", op);
16557
0
        return NULL;
16558
0
    }
16559
0
    return _PyUnicode_DATA(op);
16560
0
}