Coverage Report

Created: 2025-07-11 06:59

/src/Python-3.8.3/Objects/unicodeobject.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
3
Unicode implementation based on original code by Fredrik Lundh,
4
modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6
Major speed upgrades to the method implementations at the Reykjavik
7
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9
Copyright (c) Corporation for National Research Initiatives.
10
11
--------------------------------------------------------------------
12
The original string type implementation is:
13
14
  Copyright (c) 1999 by Secret Labs AB
15
  Copyright (c) 1999 by Fredrik Lundh
16
17
By obtaining, using, and/or copying this software and/or its
18
associated documentation, you agree that you have read, understood,
19
and will comply with the following terms and conditions:
20
21
Permission to use, copy, modify, and distribute this software and its
22
associated documentation for any purpose and without fee is hereby
23
granted, provided that the above copyright notice appears in all
24
copies, and that both that copyright notice and this permission notice
25
appear in supporting documentation, and that the name of Secret Labs
26
AB or the author not be used in advertising or publicity pertaining to
27
distribution of the software without specific, written prior
28
permission.
29
30
SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37
--------------------------------------------------------------------
38
39
*/
40
41
#define PY_SSIZE_T_CLEAN
42
#include "Python.h"
43
#include "pycore_initconfig.h"
44
#include "pycore_fileutils.h"
45
#include "pycore_object.h"
46
#include "pycore_pylifecycle.h"
47
#include "pycore_pystate.h"
48
#include "ucnhash.h"
49
#include "bytes_methods.h"
50
#include "stringlib/eq.h"
51
52
#ifdef MS_WINDOWS
53
#include <windows.h>
54
#endif
55
56
/* Uncomment to display statistics on interned strings at exit when
57
   using Valgrind or Insure++. */
58
/* #define INTERNED_STATS 1 */
59
60
61
/*[clinic input]
62
class str "PyObject *" "&PyUnicode_Type"
63
[clinic start generated code]*/
64
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66
/*[python input]
67
class Py_UCS4_converter(CConverter):
68
    type = 'Py_UCS4'
69
    converter = 'convert_uc'
70
71
    def converter_init(self):
72
        if self.default is not unspecified:
73
            self.c_default = ascii(self.default)
74
            if len(self.c_default) > 4 or self.c_default[0] != "'":
75
                self.c_default = hex(ord(self.default))
76
77
[python start generated code]*/
78
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
79
80
/* --- Globals ------------------------------------------------------------
81
82
NOTE: In the interpreter's initialization phase, some globals are currently
83
      initialized dynamically as needed. In the process Unicode objects may
84
      be created before the Unicode type is ready.
85
86
*/
87
88
89
#ifdef __cplusplus
90
extern "C" {
91
#endif
92
93
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check applied by the accessor macros in this file: release builds
   only verify the type, debug builds run the consistency checker (without
   the O(n) content scan). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
101
102
/* Field accessors for str objects.

   The underscore-prefixed macros only cast `op` and select a struct member
   (so they can also be used as assignment targets); apart from
   _PyUnicode_KIND and _PyUnicode_GET_LENGTH they perform no checking.
   PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that `op` is a ready
   str and, for compact ASCII strings, use the memory immediately following
   the PyASCIIObject header (respectively the `length` field) instead of the
   separate utf8 / utf8_length members. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
136
137
/* File-local replacement for PyUnicode_READY(): asserts the object passes
   _PyUnicode_CHECK, then evaluates to 0 if it is already ready and to the
   result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))

/* True if the utf8 pointer is the very same pointer as the character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
148
/* True if the wstr pointer of `op` is the very same pointer as its
   character data, i.e. the wchar_t representation is shared rather than
   separately allocated.

   The comparison uses the macro parameter `op` on both sides; the macro
   does not depend on any variable name at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
152
/* True when the object has its own UTF-8 buffer: it is not a compact ASCII
   string, its utf8 pointer is set, and that pointer is not the character
   data pointer itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True when the object has its own wstr buffer: the wstr pointer is set and
   either the string is not ready yet or the pointer is not the character
   data pointer itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
166
/* Copy a run of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (pointers to "from_type"); to points to the
   destination buffer of "to_type" elements, which must have room for
   (end - begin) items.  Each element is converted with a plain cast.  The
   bulk of the input is handled four elements per iteration, the remainder
   one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_dst = (to_type *)(to);                             \
        const from_type *_src = (const from_type *)(begin);          \
        const from_type *_stop = (const from_type *)(end);           \
        Py_ssize_t _count = (_stop) - (_src);                        \
        const from_type *_quad_stop =                                \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);                   \
        for (; _src < (_quad_stop); _src += 4, _dst += 4) {          \
            _dst[0] = (to_type) _src[0];                             \
            _dst[1] = (to_type) _src[1];                             \
            _dst[2] = (to_type) _src[2];                             \
            _dst[3] = (to_type) _src[3];                             \
        }                                                            \
        for (; _src < (_stop); ++_src, ++_dst) {                     \
            *_dst = (to_type) *_src;                                 \
        }                                                            \
    } while (0)
189
190
/* Platform-specific over-allocation factor; per the notes below, 4 stands
   for 25% of extra space and 2 for 50%. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
197
198
/* This dictionary holds all interned unicode strings.  Note that references
199
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
200
   When the interned string reaches a refcnt of 0 the string deallocation
201
   function will delete the reference from this dictionary.
202
203
   Another way to look at this is to say that the actual reference
204
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
205
*/
206
static PyObject *interned = NULL;
207
208
/* The empty Unicode object is shared to improve performance. */
209
static PyObject *unicode_empty = NULL;
210
211
/* Take a new reference to the shared empty string, creating the singleton
   first if it does not exist yet.  If the creation fails, unicode_empty
   stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (returns NULL if the singleton could not be created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
229
230
static inline void
231
unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232
             Py_ssize_t start, Py_ssize_t length)
233
0
{
234
0
    assert(0 <= start);
235
0
    assert(kind != PyUnicode_WCHAR_KIND);
236
0
    switch (kind) {
237
0
    case PyUnicode_1BYTE_KIND: {
238
0
        assert(value <= 0xff);
239
0
        Py_UCS1 ch = (unsigned char)value;
240
0
        Py_UCS1 *to = (Py_UCS1 *)data + start;
241
0
        memset(to, ch, length);
242
0
        break;
243
0
    }
244
0
    case PyUnicode_2BYTE_KIND: {
245
0
        assert(value <= 0xffff);
246
0
        Py_UCS2 ch = (Py_UCS2)value;
247
0
        Py_UCS2 *to = (Py_UCS2 *)data + start;
248
0
        const Py_UCS2 *end = to + length;
249
0
        for (; to < end; ++to) *to = ch;
250
0
        break;
251
0
    }
252
0
    case PyUnicode_4BYTE_KIND: {
253
0
        assert(value <= MAX_UNICODE);
254
0
        Py_UCS4 ch = value;
255
0
        Py_UCS4 * to = (Py_UCS4 *)data + start;
256
0
        const Py_UCS4 *end = to + length;
257
0
        for (; to < end; ++to) *to = ch;
258
0
        break;
259
0
    }
260
0
    default: Py_UNREACHABLE();
261
0
    }
262
0
}
263
264
265
/* Forward declaration */
266
static inline int
267
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
268
static PyObject *
269
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270
                    const char *errors);
271
static PyObject *
272
unicode_decode_utf8(const char *s, Py_ssize_t size,
273
                    _Py_error_handler error_handler, const char *errors,
274
                    Py_ssize_t *consumed);
275
276
/* List of static strings. */
277
static _Py_Identifier *static_strings = NULL;
278
279
/* Single character Unicode strings in the Latin-1 range are being
280
   shared as well. */
281
static PyObject *unicode_latin1[256] = {NULL};
282
283
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 if the ASCII character c is whitespace and 0
   otherwise.  Entries that are not listed are zero. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
313
314
/* forward */
315
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
316
static PyObject* get_latin1_char(unsigned char ch);
317
static int unicode_modifiable(PyObject *unicode);
318
319
320
static PyObject *
321
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
322
static PyObject *
323
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324
static PyObject *
325
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327
static PyObject *
328
unicode_encode_call_errorhandler(const char *errors,
329
       PyObject **errorHandler,const char *encoding, const char *reason,
330
       PyObject *unicode, PyObject **exceptionObject,
331
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
333
static void
334
raise_encode_exception(PyObject **exceptionObject,
335
                       const char *encoding,
336
                       PyObject *unicode,
337
                       Py_ssize_t startpos, Py_ssize_t endpos,
338
                       const char *reason);
339
340
/* Lookup table for fast detection of line break characters: entry c is 1
   if the ASCII character c is a line break and 0 otherwise.  Entries that
   are not listed are zero. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
367
368
static int convert_uc(PyObject *obj, void *addr);
369
370
#include "clinic/unicodeobject.c.h"
371
372
_Py_error_handler
373
_Py_GetErrorHandler(const char *errors)
374
230
{
375
230
    if (errors == NULL || strcmp(errors, "strict") == 0) {
376
0
        return _Py_ERROR_STRICT;
377
0
    }
378
230
    if (strcmp(errors, "surrogateescape") == 0) {
379
230
        return _Py_ERROR_SURROGATEESCAPE;
380
230
    }
381
0
    if (strcmp(errors, "replace") == 0) {
382
0
        return _Py_ERROR_REPLACE;
383
0
    }
384
0
    if (strcmp(errors, "ignore") == 0) {
385
0
        return _Py_ERROR_IGNORE;
386
0
    }
387
0
    if (strcmp(errors, "backslashreplace") == 0) {
388
0
        return _Py_ERROR_BACKSLASHREPLACE;
389
0
    }
390
0
    if (strcmp(errors, "surrogatepass") == 0) {
391
0
        return _Py_ERROR_SURROGATEPASS;
392
0
    }
393
0
    if (strcmp(errors, "xmlcharrefreplace") == 0) {
394
0
        return _Py_ERROR_XMLCHARREFREPLACE;
395
0
    }
396
0
    return _Py_ERROR_OTHER;
397
0
}
398
399
400
static _Py_error_handler
401
get_error_handler_wide(const wchar_t *errors)
402
4.99k
{
403
4.99k
    if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404
0
        return _Py_ERROR_STRICT;
405
0
    }
406
4.99k
    if (wcscmp(errors, L"surrogateescape") == 0) {
407
4.99k
        return _Py_ERROR_SURROGATEESCAPE;
408
4.99k
    }
409
0
    if (wcscmp(errors, L"replace") == 0) {
410
0
        return _Py_ERROR_REPLACE;
411
0
    }
412
0
    if (wcscmp(errors, L"ignore") == 0) {
413
0
        return _Py_ERROR_IGNORE;
414
0
    }
415
0
    if (wcscmp(errors, L"backslashreplace") == 0) {
416
0
        return _Py_ERROR_BACKSLASHREPLACE;
417
0
    }
418
0
    if (wcscmp(errors, L"surrogatepass") == 0) {
419
0
        return _Py_ERROR_SURROGATEPASS;
420
0
    }
421
0
    if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422
0
        return _Py_ERROR_XMLCHARREFREPLACE;
423
0
    }
424
0
    return _Py_ERROR_OTHER;
425
0
}
426
427
428
/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429
   This function is kept for backward compatibility with the old API. */
430
Py_UNICODE
431
PyUnicode_GetMax(void)
432
0
{
433
0
#ifdef Py_UNICODE_WIDE
434
0
    return 0x10FFFF;
435
#else
436
    /* This is actually an illegal character, so it should
437
       not be passed to unichr. */
438
    return 0xFFFF;
439
#endif
440
0
}
441
442
int
443
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
444
0
{
445
0
#define CHECK(expr) \
446
0
    do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
447
448
0
    PyASCIIObject *ascii;
449
0
    unsigned int kind;
450
451
0
    assert(op != NULL);
452
0
    CHECK(PyUnicode_Check(op));
453
454
0
    ascii = (PyASCIIObject *)op;
455
0
    kind = ascii->state.kind;
456
457
0
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
458
0
        CHECK(kind == PyUnicode_1BYTE_KIND);
459
0
        CHECK(ascii->state.ready == 1);
460
0
    }
461
0
    else {
462
0
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
463
0
        void *data;
464
465
0
        if (ascii->state.compact == 1) {
466
0
            data = compact + 1;
467
0
            CHECK(kind == PyUnicode_1BYTE_KIND
468
0
                                 || kind == PyUnicode_2BYTE_KIND
469
0
                                 || kind == PyUnicode_4BYTE_KIND);
470
0
            CHECK(ascii->state.ascii == 0);
471
0
            CHECK(ascii->state.ready == 1);
472
0
            CHECK(compact->utf8 != data);
473
0
        }
474
0
        else {
475
0
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;
476
477
0
            data = unicode->data.any;
478
0
            if (kind == PyUnicode_WCHAR_KIND) {
479
0
                CHECK(ascii->length == 0);
480
0
                CHECK(ascii->hash == -1);
481
0
                CHECK(ascii->state.compact == 0);
482
0
                CHECK(ascii->state.ascii == 0);
483
0
                CHECK(ascii->state.ready == 0);
484
0
                CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
485
0
                CHECK(ascii->wstr != NULL);
486
0
                CHECK(data == NULL);
487
0
                CHECK(compact->utf8 == NULL);
488
0
            }
489
0
            else {
490
0
                CHECK(kind == PyUnicode_1BYTE_KIND
491
0
                                     || kind == PyUnicode_2BYTE_KIND
492
0
                                     || kind == PyUnicode_4BYTE_KIND);
493
0
                CHECK(ascii->state.compact == 0);
494
0
                CHECK(ascii->state.ready == 1);
495
0
                CHECK(data != NULL);
496
0
                if (ascii->state.ascii) {
497
0
                    CHECK(compact->utf8 == data);
498
0
                    CHECK(compact->utf8_length == ascii->length);
499
0
                }
500
0
                else
501
0
                    CHECK(compact->utf8 != data);
502
0
            }
503
0
        }
504
0
        if (kind != PyUnicode_WCHAR_KIND) {
505
0
            if (
506
#if SIZEOF_WCHAR_T == 2
507
                kind == PyUnicode_2BYTE_KIND
508
#else
509
0
                kind == PyUnicode_4BYTE_KIND
510
0
#endif
511
0
               )
512
0
            {
513
0
                CHECK(ascii->wstr == data);
514
0
                CHECK(compact->wstr_length == ascii->length);
515
0
            } else
516
0
                CHECK(ascii->wstr != data);
517
0
        }
518
519
0
        if (compact->utf8 == NULL)
520
0
            CHECK(compact->utf8_length == 0);
521
0
        if (ascii->wstr == NULL)
522
0
            CHECK(compact->wstr_length == 0);
523
0
    }
524
525
    /* check that the best kind is used: O(n) operation */
526
0
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
527
0
        Py_ssize_t i;
528
0
        Py_UCS4 maxchar = 0;
529
0
        void *data;
530
0
        Py_UCS4 ch;
531
532
0
        data = PyUnicode_DATA(ascii);
533
0
        for (i=0; i < ascii->length; i++)
534
0
        {
535
0
            ch = PyUnicode_READ(kind, data, i);
536
0
            if (ch > maxchar)
537
0
                maxchar = ch;
538
0
        }
539
0
        if (kind == PyUnicode_1BYTE_KIND) {
540
0
            if (ascii->state.ascii == 0) {
541
0
                CHECK(maxchar >= 128);
542
0
                CHECK(maxchar <= 255);
543
0
            }
544
0
            else
545
0
                CHECK(maxchar < 128);
546
0
        }
547
0
        else if (kind == PyUnicode_2BYTE_KIND) {
548
0
            CHECK(maxchar >= 0x100);
549
0
            CHECK(maxchar <= 0xFFFF);
550
0
        }
551
0
        else {
552
0
            CHECK(maxchar >= 0x10000);
553
0
            CHECK(maxchar <= MAX_UNICODE);
554
0
        }
555
0
        CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
556
0
    }
557
0
    return 1;
558
559
0
#undef CHECK
560
0
}
561
562
563
/* Finalize a freshly built string that is still in the wchar_t
   representation.

   Release builds: an empty string is replaced by the shared empty string, a
   single character below 256 by the result of get_latin1_char(); any other
   string is made ready.  The reference to `unicode` is released whenever a
   different object (or NULL, on failure to make it ready) is returned.

   Debug builds: the string is only checked and returned as is. */
static PyObject*
unicode_result_wchar(PyObject *unicode)
{
#ifdef Py_DEBUG
    assert(Py_REFCNT(unicode) == 1);

    /* don't make the result ready in debug mode to ensure that the caller
       makes the string ready before using it */
    assert(_PyUnicode_CheckConsistency(unicode, 1));
#else
    const Py_ssize_t wlen = _PyUnicode_WSTR_LENGTH(unicode);

    if (wlen == 0) {
        Py_DECREF(unicode);
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (wlen == 1) {
        const wchar_t first = _PyUnicode_WSTR(unicode)[0];
        if ((Py_UCS4)first < 256) {
            PyObject *shared = get_latin1_char((unsigned char)first);
            Py_DECREF(unicode);
            return shared;
        }
    }

    if (_PyUnicode_Ready(unicode) < 0) {
        Py_DECREF(unicode);
        return NULL;
    }
#endif
    return unicode;
}
597
598
/* Finalize a ready string before handing it out.

   - An empty string is replaced by the shared empty string.
   - A one-character string whose character is below 256 is replaced by the
     entry cached in unicode_latin1[]; if no entry exists yet, `unicode`
     itself is stored there (the cache takes its own reference).
   - Any other string is returned unchanged.

   When a different object is returned, the reference to `unicode` is
   released and a new reference to the replacement is returned. */
static PyObject*
unicode_result_ready(PyObject *unicode)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);

    if (nchars == 0) {
        if (unicode == unicode_empty) {
            return unicode_empty;
        }
        Py_DECREF(unicode);
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (nchars == 1) {
        void *data = PyUnicode_DATA(unicode);
        int kind = PyUnicode_KIND(unicode);
        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);

        if (ch < 256) {
            PyObject *cached = unicode_latin1[ch];

            if (cached == NULL) {
                /* first occurrence of this character: populate the cache */
                assert(_PyUnicode_CheckConsistency(unicode, 1));
                Py_INCREF(unicode);
                unicode_latin1[ch] = unicode;
                return unicode;
            }
            if (unicode != cached) {
                Py_INCREF(cached);
                Py_DECREF(unicode);
            }
            return cached;
        }
    }

    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return unicode;
}
637
638
/* Finalize a newly created string: dispatch to unicode_result_ready() for
   ready strings and to unicode_result_wchar() for strings that are not
   ready yet. */
static PyObject*
unicode_result(PyObject *unicode)
{
    assert(_PyUnicode_CHECK(unicode));
    return PyUnicode_IS_READY(unicode)
        ? unicode_result_ready(unicode)
        : unicode_result_wchar(unicode);
}
647
648
/* Return `unicode` as the result of an operation that left it unchanged.

   For an exact str, the string is made ready and a new reference to the
   same object is returned (NULL if making it ready fails).  For an instance
   of a subtype, the result of _PyUnicode_Copy() is returned instead, i.e. a
   genuine str with the same value. */
static PyObject*
unicode_result_unchanged(PyObject *unicode)
{
    if (!PyUnicode_CheckExact(unicode)) {
        /* Subtype -- return genuine unicode string with the same value. */
        return _PyUnicode_Copy(unicode);
    }

    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }
    Py_INCREF(unicode);
    return unicode;
}
661
662
/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
663
   ASCII, Latin1, UTF-8, etc. */
664
static char*
665
backslashreplace(_PyBytesWriter *writer, char *str,
666
                 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
667
0
{
668
0
    Py_ssize_t size, i;
669
0
    Py_UCS4 ch;
670
0
    enum PyUnicode_Kind kind;
671
0
    void *data;
672
673
0
    assert(PyUnicode_IS_READY(unicode));
674
0
    kind = PyUnicode_KIND(unicode);
675
0
    data = PyUnicode_DATA(unicode);
676
677
0
    size = 0;
678
    /* determine replacement size */
679
0
    for (i = collstart; i < collend; ++i) {
680
0
        Py_ssize_t incr;
681
682
0
        ch = PyUnicode_READ(kind, data, i);
683
0
        if (ch < 0x100)
684
0
            incr = 2+2;
685
0
        else if (ch < 0x10000)
686
0
            incr = 2+4;
687
0
        else {
688
0
            assert(ch <= MAX_UNICODE);
689
0
            incr = 2+8;
690
0
        }
691
0
        if (size > PY_SSIZE_T_MAX - incr) {
692
0
            PyErr_SetString(PyExc_OverflowError,
693
0
                            "encoded result is too long for a Python string");
694
0
            return NULL;
695
0
        }
696
0
        size += incr;
697
0
    }
698
699
0
    str = _PyBytesWriter_Prepare(writer, str, size);
700
0
    if (str == NULL)
701
0
        return NULL;
702
703
    /* generate replacement */
704
0
    for (i = collstart; i < collend; ++i) {
705
0
        ch = PyUnicode_READ(kind, data, i);
706
0
        *str++ = '\\';
707
0
        if (ch >= 0x00010000) {
708
0
            *str++ = 'U';
709
0
            *str++ = Py_hexdigits[(ch>>28)&0xf];
710
0
            *str++ = Py_hexdigits[(ch>>24)&0xf];
711
0
            *str++ = Py_hexdigits[(ch>>20)&0xf];
712
0
            *str++ = Py_hexdigits[(ch>>16)&0xf];
713
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
714
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
715
0
        }
716
0
        else if (ch >= 0x100) {
717
0
            *str++ = 'u';
718
0
            *str++ = Py_hexdigits[(ch>>12)&0xf];
719
0
            *str++ = Py_hexdigits[(ch>>8)&0xf];
720
0
        }
721
0
        else
722
0
            *str++ = 'x';
723
0
        *str++ = Py_hexdigits[(ch>>4)&0xf];
724
0
        *str++ = Py_hexdigits[ch&0xf];
725
0
    }
726
0
    return str;
727
0
}
728
729
/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
730
   ASCII, Latin1, UTF-8, etc. */
731
static char*
732
xmlcharrefreplace(_PyBytesWriter *writer, char *str,
733
                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
734
0
{
735
0
    Py_ssize_t size, i;
736
0
    Py_UCS4 ch;
737
0
    enum PyUnicode_Kind kind;
738
0
    void *data;
739
740
0
    assert(PyUnicode_IS_READY(unicode));
741
0
    kind = PyUnicode_KIND(unicode);
742
0
    data = PyUnicode_DATA(unicode);
743
744
0
    size = 0;
745
    /* determine replacement size */
746
0
    for (i = collstart; i < collend; ++i) {
747
0
        Py_ssize_t incr;
748
749
0
        ch = PyUnicode_READ(kind, data, i);
750
0
        if (ch < 10)
751
0
            incr = 2+1+1;
752
0
        else if (ch < 100)
753
0
            incr = 2+2+1;
754
0
        else if (ch < 1000)
755
0
            incr = 2+3+1;
756
0
        else if (ch < 10000)
757
0
            incr = 2+4+1;
758
0
        else if (ch < 100000)
759
0
            incr = 2+5+1;
760
0
        else if (ch < 1000000)
761
0
            incr = 2+6+1;
762
0
        else {
763
0
            assert(ch <= MAX_UNICODE);
764
0
            incr = 2+7+1;
765
0
        }
766
0
        if (size > PY_SSIZE_T_MAX - incr) {
767
0
            PyErr_SetString(PyExc_OverflowError,
768
0
                            "encoded result is too long for a Python string");
769
0
            return NULL;
770
0
        }
771
0
        size += incr;
772
0
    }
773
774
0
    str = _PyBytesWriter_Prepare(writer, str, size);
775
0
    if (str == NULL)
776
0
        return NULL;
777
778
    /* generate replacement */
779
0
    for (i = collstart; i < collend; ++i) {
780
0
        str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
781
0
    }
782
0
    return str;
783
0
}
784
785
/* --- Bloom Filters ----------------------------------------------------- */
786
787
/* stuff to implement simple "bloom filters" for Unicode characters.
788
   to keep things simple, we use a single bitmask, using the least 5
789
   bits from each Unicode character as the bit index. */
790
791
/* the linebreak mask is set up by Unicode_Init below */
792
793
#if LONG_BIT >= 128
794
#define BLOOM_WIDTH 128
795
#elif LONG_BIT >= 64
796
7.58k
#define BLOOM_WIDTH 64
797
#elif LONG_BIT >= 32
798
#define BLOOM_WIDTH 32
799
#else
800
#error "LONG_BIT is smaller than 32"
801
#endif
802
803
7.44k
/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters; starts with all bits set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`,
   i.e. if `ch` may belong to the set the mask was built from; zero means it
   definitely does not.  Both arguments are parenthesized so that arbitrary
   expressions (e.g. `a | b`) can be passed as the mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: a direct table lookup for characters below 128,
   otherwise the bloom mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called if the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
812
813
static inline BLOOM_MASK
814
make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
815
3.72k
{
816
3.72k
#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
817
3.72k
    do {                                               \
818
3.72k
        TYPE *data = (TYPE *)PTR;                      \
819
3.72k
        TYPE *end = data + LEN;                        \
820
3.72k
        Py_UCS4 ch;                                    \
821
7.55k
        for (; data != end; data++) {                  \
822
3.82k
            ch = *data;                                \
823
3.82k
            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
824
3.82k
        }                                              \
825
3.72k
        break;                                         \
826
3.72k
    } while (0)
827
828
    /* calculate simple bloom-style bitmask for a given unicode string */
829
830
3.72k
    BLOOM_MASK mask;
831
832
3.72k
    mask = 0;
833
3.72k
    switch (kind) {
834
3.71k
    case PyUnicode_1BYTE_KIND:
835
3.71k
        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
836
0
        break;
837
14
    case PyUnicode_2BYTE_KIND:
838
14
        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
839
0
        break;
840
0
    case PyUnicode_4BYTE_KIND:
841
0
        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
842
0
        break;
843
0
    default:
844
0
        Py_UNREACHABLE();
845
3.72k
    }
846
3.72k
    return mask;
847
848
3.72k
#undef BLOOM_UPDATE
849
3.72k
}
850
851
static int
852
ensure_unicode(PyObject *obj)
853
5.00k
{
854
5.00k
    if (!PyUnicode_Check(obj)) {
855
0
        PyErr_Format(PyExc_TypeError,
856
0
                     "must be str, not %.100s",
857
0
                     Py_TYPE(obj)->tp_name);
858
0
        return -1;
859
0
    }
860
5.00k
    return PyUnicode_READY(obj);
861
5.00k
}
862
863
/* Compilation of templated routines */
864
865
#include "stringlib/asciilib.h"
866
#include "stringlib/fastsearch.h"
867
#include "stringlib/partition.h"
868
#include "stringlib/split.h"
869
#include "stringlib/count.h"
870
#include "stringlib/find.h"
871
#include "stringlib/find_max_char.h"
872
#include "stringlib/undef.h"
873
874
#include "stringlib/ucs1lib.h"
875
#include "stringlib/fastsearch.h"
876
#include "stringlib/partition.h"
877
#include "stringlib/split.h"
878
#include "stringlib/count.h"
879
#include "stringlib/find.h"
880
#include "stringlib/replace.h"
881
#include "stringlib/find_max_char.h"
882
#include "stringlib/undef.h"
883
884
#include "stringlib/ucs2lib.h"
885
#include "stringlib/fastsearch.h"
886
#include "stringlib/partition.h"
887
#include "stringlib/split.h"
888
#include "stringlib/count.h"
889
#include "stringlib/find.h"
890
#include "stringlib/replace.h"
891
#include "stringlib/find_max_char.h"
892
#include "stringlib/undef.h"
893
894
#include "stringlib/ucs4lib.h"
895
#include "stringlib/fastsearch.h"
896
#include "stringlib/partition.h"
897
#include "stringlib/split.h"
898
#include "stringlib/count.h"
899
#include "stringlib/find.h"
900
#include "stringlib/replace.h"
901
#include "stringlib/find_max_char.h"
902
#include "stringlib/undef.h"
903
904
#include "stringlib/unicodedefs.h"
905
#include "stringlib/fastsearch.h"
906
#include "stringlib/count.h"
907
#include "stringlib/find.h"
908
#include "stringlib/undef.h"
909
910
/* --- Unicode Object ----------------------------------------------------- */
911
912
static inline Py_ssize_t
913
findchar(const void *s, int kind,
914
         Py_ssize_t size, Py_UCS4 ch,
915
         int direction)
916
2.66k
{
917
2.66k
    switch (kind) {
918
2.66k
    case PyUnicode_1BYTE_KIND:
919
2.66k
        if ((Py_UCS1) ch != ch)
920
0
            return -1;
921
2.66k
        if (direction > 0)
922
2.57k
            return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
923
84
        else
924
84
            return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
925
0
    case PyUnicode_2BYTE_KIND:
926
0
        if ((Py_UCS2) ch != ch)
927
0
            return -1;
928
0
        if (direction > 0)
929
0
            return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
930
0
        else
931
0
            return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
932
0
    case PyUnicode_4BYTE_KIND:
933
0
        if (direction > 0)
934
0
            return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
935
0
        else
936
0
            return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
937
0
    default:
938
0
        Py_UNREACHABLE();
939
2.66k
    }
940
2.66k
}
941
942
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters in [old_length, current length) of
   a Unicode string with 0xff bytes so that code reading them before they
   are initialized is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF
   is an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes: characters times char_size. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
960
961
static PyObject*
962
resize_compact(PyObject *unicode, Py_ssize_t length)
963
6.39k
{
964
6.39k
    Py_ssize_t char_size;
965
6.39k
    Py_ssize_t struct_size;
966
6.39k
    Py_ssize_t new_size;
967
6.39k
    int share_wstr;
968
6.39k
    PyObject *new_unicode;
969
#ifdef Py_DEBUG
970
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
971
#endif
972
973
6.39k
    assert(unicode_modifiable(unicode));
974
6.39k
    assert(PyUnicode_IS_READY(unicode));
975
6.39k
    assert(PyUnicode_IS_COMPACT(unicode));
976
977
6.39k
    char_size = PyUnicode_KIND(unicode);
978
6.39k
    if (PyUnicode_IS_ASCII(unicode))
979
6.37k
        struct_size = sizeof(PyASCIIObject);
980
15
    else
981
15
        struct_size = sizeof(PyCompactUnicodeObject);
982
6.39k
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
983
984
6.39k
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
985
0
        PyErr_NoMemory();
986
0
        return NULL;
987
0
    }
988
6.39k
    new_size = (struct_size + (length + 1) * char_size);
989
990
6.39k
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
991
0
        PyObject_DEL(_PyUnicode_UTF8(unicode));
992
0
        _PyUnicode_UTF8(unicode) = NULL;
993
0
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
994
0
    }
995
6.39k
    _Py_DEC_REFTOTAL;
996
6.39k
    _Py_ForgetReference(unicode);
997
998
6.39k
    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
999
6.39k
    if (new_unicode == NULL) {
1000
0
        _Py_NewReference(unicode);
1001
0
        PyErr_NoMemory();
1002
0
        return NULL;
1003
0
    }
1004
6.39k
    unicode = new_unicode;
1005
6.39k
    _Py_NewReference(unicode);
1006
1007
6.39k
    _PyUnicode_LENGTH(unicode) = length;
1008
6.39k
    if (share_wstr) {
1009
0
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
1010
0
        if (!PyUnicode_IS_ASCII(unicode))
1011
0
            _PyUnicode_WSTR_LENGTH(unicode) = length;
1012
0
    }
1013
6.39k
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1014
0
        PyObject_DEL(_PyUnicode_WSTR(unicode));
1015
0
        _PyUnicode_WSTR(unicode) = NULL;
1016
0
        if (!PyUnicode_IS_ASCII(unicode))
1017
0
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1018
0
    }
1019
#ifdef Py_DEBUG
1020
    unicode_fill_invalid(unicode, old_length);
1021
#endif
1022
6.39k
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1023
6.39k
                    length, 0);
1024
6.39k
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1025
6.39k
    return unicode;
1026
6.39k
}
1027
1028
static int
1029
resize_inplace(PyObject *unicode, Py_ssize_t length)
1030
0
{
1031
0
    wchar_t *wstr;
1032
0
    Py_ssize_t new_size;
1033
0
    assert(!PyUnicode_IS_COMPACT(unicode));
1034
0
    assert(Py_REFCNT(unicode) == 1);
1035
1036
0
    if (PyUnicode_IS_READY(unicode)) {
1037
0
        Py_ssize_t char_size;
1038
0
        int share_wstr, share_utf8;
1039
0
        void *data;
1040
#ifdef Py_DEBUG
1041
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1042
#endif
1043
1044
0
        data = _PyUnicode_DATA_ANY(unicode);
1045
0
        char_size = PyUnicode_KIND(unicode);
1046
0
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1047
0
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
1048
1049
0
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1050
0
            PyErr_NoMemory();
1051
0
            return -1;
1052
0
        }
1053
0
        new_size = (length + 1) * char_size;
1054
1055
0
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1056
0
        {
1057
0
            PyObject_DEL(_PyUnicode_UTF8(unicode));
1058
0
            _PyUnicode_UTF8(unicode) = NULL;
1059
0
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1060
0
        }
1061
1062
0
        data = (PyObject *)PyObject_REALLOC(data, new_size);
1063
0
        if (data == NULL) {
1064
0
            PyErr_NoMemory();
1065
0
            return -1;
1066
0
        }
1067
0
        _PyUnicode_DATA_ANY(unicode) = data;
1068
0
        if (share_wstr) {
1069
0
            _PyUnicode_WSTR(unicode) = data;
1070
0
            _PyUnicode_WSTR_LENGTH(unicode) = length;
1071
0
        }
1072
0
        if (share_utf8) {
1073
0
            _PyUnicode_UTF8(unicode) = data;
1074
0
            _PyUnicode_UTF8_LENGTH(unicode) = length;
1075
0
        }
1076
0
        _PyUnicode_LENGTH(unicode) = length;
1077
0
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
1078
#ifdef Py_DEBUG
1079
        unicode_fill_invalid(unicode, old_length);
1080
#endif
1081
0
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
1082
0
            assert(_PyUnicode_CheckConsistency(unicode, 0));
1083
0
            return 0;
1084
0
        }
1085
0
    }
1086
0
    assert(_PyUnicode_WSTR(unicode) != NULL);
1087
1088
    /* check for integer overflow */
1089
0
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
1090
0
        PyErr_NoMemory();
1091
0
        return -1;
1092
0
    }
1093
0
    new_size = sizeof(wchar_t) * (length + 1);
1094
0
    wstr =  _PyUnicode_WSTR(unicode);
1095
0
    wstr = PyObject_REALLOC(wstr, new_size);
1096
0
    if (!wstr) {
1097
0
        PyErr_NoMemory();
1098
0
        return -1;
1099
0
    }
1100
0
    _PyUnicode_WSTR(unicode) = wstr;
1101
0
    _PyUnicode_WSTR(unicode)[length] = 0;
1102
0
    _PyUnicode_WSTR_LENGTH(unicode) = length;
1103
0
    assert(_PyUnicode_CheckConsistency(unicode, 0));
1104
0
    return 0;
1105
0
}
1106
1107
static PyObject*
1108
resize_copy(PyObject *unicode, Py_ssize_t length)
1109
0
{
1110
0
    Py_ssize_t copy_length;
1111
0
    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
1112
0
        PyObject *copy;
1113
1114
0
        assert(PyUnicode_IS_READY(unicode));
1115
1116
0
        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1117
0
        if (copy == NULL)
1118
0
            return NULL;
1119
1120
0
        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
1121
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
1122
0
        return copy;
1123
0
    }
1124
0
    else {
1125
0
        PyObject *w;
1126
1127
0
        w = (PyObject*)_PyUnicode_New(length);
1128
0
        if (w == NULL)
1129
0
            return NULL;
1130
0
        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1131
0
        copy_length = Py_MIN(copy_length, length);
1132
0
        memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
1133
0
                  copy_length * sizeof(wchar_t));
1134
0
        return w;
1135
0
    }
1136
0
}
1137
1138
/* We allocate one more byte to make sure the string is
1139
   Ux0000 terminated; some code (e.g. new_identifier)
1140
   relies on that.
1141
1142
   XXX This allocator could further be enhanced by assuring that the
1143
   free list never reduces its size below 1.
1144
1145
*/
1146
1147
static PyUnicodeObject *
1148
_PyUnicode_New(Py_ssize_t length)
1149
14
{
1150
14
    PyUnicodeObject *unicode;
1151
14
    size_t new_size;
1152
1153
    /* Optimization for empty strings */
1154
14
    if (length == 0 && unicode_empty != NULL) {
1155
14
        Py_INCREF(unicode_empty);
1156
14
        return (PyUnicodeObject*)unicode_empty;
1157
14
    }
1158
1159
    /* Ensure we won't overflow the size. */
1160
0
    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
1161
0
        return (PyUnicodeObject *)PyErr_NoMemory();
1162
0
    }
1163
0
    if (length < 0) {
1164
0
        PyErr_SetString(PyExc_SystemError,
1165
0
                        "Negative size passed to _PyUnicode_New");
1166
0
        return NULL;
1167
0
    }
1168
1169
0
    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1170
0
    if (unicode == NULL)
1171
0
        return NULL;
1172
0
    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
1173
1174
0
    _PyUnicode_WSTR_LENGTH(unicode) = length;
1175
0
    _PyUnicode_HASH(unicode) = -1;
1176
0
    _PyUnicode_STATE(unicode).interned = 0;
1177
0
    _PyUnicode_STATE(unicode).kind = 0;
1178
0
    _PyUnicode_STATE(unicode).compact = 0;
1179
0
    _PyUnicode_STATE(unicode).ready = 0;
1180
0
    _PyUnicode_STATE(unicode).ascii = 0;
1181
0
    _PyUnicode_DATA_ANY(unicode) = NULL;
1182
0
    _PyUnicode_LENGTH(unicode) = 0;
1183
0
    _PyUnicode_UTF8(unicode) = NULL;
1184
0
    _PyUnicode_UTF8_LENGTH(unicode) = 0;
1185
1186
0
    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1187
0
    if (!_PyUnicode_WSTR(unicode)) {
1188
0
        Py_DECREF(unicode);
1189
0
        PyErr_NoMemory();
1190
0
        return NULL;
1191
0
    }
1192
1193
    /* Initialize the first element to guard against cases where
1194
     * the caller fails before initializing str -- unicode_resize()
1195
     * reads str[0], and the Keep-Alive optimization can keep memory
1196
     * allocated for str alive across a call to unicode_dealloc(unicode).
1197
     * We don't want unicode_resize to read uninitialized memory in
1198
     * that case.
1199
     */
1200
0
    _PyUnicode_WSTR(unicode)[0] = 0;
1201
0
    _PyUnicode_WSTR(unicode)[length] = 0;
1202
1203
0
    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
1204
0
    return unicode;
1205
0
}
1206
1207
static const char*
1208
unicode_kind_name(PyObject *unicode)
1209
0
{
1210
    /* don't check consistency: unicode_kind_name() is called from
1211
       _PyUnicode_Dump() */
1212
0
    if (!PyUnicode_IS_COMPACT(unicode))
1213
0
    {
1214
0
        if (!PyUnicode_IS_READY(unicode))
1215
0
            return "wstr";
1216
0
        switch (PyUnicode_KIND(unicode))
1217
0
        {
1218
0
        case PyUnicode_1BYTE_KIND:
1219
0
            if (PyUnicode_IS_ASCII(unicode))
1220
0
                return "legacy ascii";
1221
0
            else
1222
0
                return "legacy latin1";
1223
0
        case PyUnicode_2BYTE_KIND:
1224
0
            return "legacy UCS2";
1225
0
        case PyUnicode_4BYTE_KIND:
1226
0
            return "legacy UCS4";
1227
0
        default:
1228
0
            return "<legacy invalid kind>";
1229
0
        }
1230
0
    }
1231
0
    assert(PyUnicode_IS_READY(unicode));
1232
0
    switch (PyUnicode_KIND(unicode)) {
1233
0
    case PyUnicode_1BYTE_KIND:
1234
0
        if (PyUnicode_IS_ASCII(unicode))
1235
0
            return "ascii";
1236
0
        else
1237
0
            return "latin1";
1238
0
    case PyUnicode_2BYTE_KIND:
1239
0
        return "UCS2";
1240
0
    case PyUnicode_4BYTE_KIND:
1241
0
        return "UCS4";
1242
0
    default:
1243
0
        return "<invalid compact kind>";
1244
0
    }
1245
0
}
1246
1247
#ifdef Py_DEBUG
1248
/* Functions wrapping macros for use in debugger */
1249
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode_raw){
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1253
1254
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode_raw) {
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
1258
void *_PyUnicode_data(void *unicode_raw) {
1259
    PyObject *unicode = _PyObject_CAST(unicode_raw);
1260
    printf("obj %p\n", (void*)unicode);
1261
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1262
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1263
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1264
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1265
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1266
    return PyUnicode_DATA(unicode);
1267
}
1268
1269
void
1270
_PyUnicode_Dump(PyObject *op)
1271
{
1272
    PyASCIIObject *ascii = (PyASCIIObject *)op;
1273
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1274
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1275
    void *data;
1276
1277
    if (ascii->state.compact)
1278
    {
1279
        if (ascii->state.ascii)
1280
            data = (ascii + 1);
1281
        else
1282
            data = (compact + 1);
1283
    }
1284
    else
1285
        data = unicode->data.any;
1286
    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1287
           unicode_kind_name(op), ascii->length);
1288
1289
    if (ascii->wstr == data)
1290
        printf("shared ");
1291
    printf("wstr=%p", (void *)ascii->wstr);
1292
1293
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1294
        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1295
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1296
            printf("shared ");
1297
        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1298
               (void *)compact->utf8, compact->utf8_length);
1299
    }
1300
    printf(", data=%p\n", data);
1301
}
1302
#endif
1303
1304
PyObject *
1305
PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1306
144k
{
1307
144k
    PyObject *obj;
1308
144k
    PyCompactUnicodeObject *unicode;
1309
144k
    void *data;
1310
144k
    enum PyUnicode_Kind kind;
1311
144k
    int is_sharing, is_ascii;
1312
144k
    Py_ssize_t char_size;
1313
144k
    Py_ssize_t struct_size;
1314
1315
    /* Optimization for empty strings */
1316
144k
    if (size == 0 && unicode_empty != NULL) {
1317
140
        Py_INCREF(unicode_empty);
1318
140
        return unicode_empty;
1319
140
    }
1320
1321
144k
    is_ascii = 0;
1322
144k
    is_sharing = 0;
1323
144k
    struct_size = sizeof(PyCompactUnicodeObject);
1324
144k
    if (maxchar < 128) {
1325
144k
        kind = PyUnicode_1BYTE_KIND;
1326
144k
        char_size = 1;
1327
144k
        is_ascii = 1;
1328
144k
        struct_size = sizeof(PyASCIIObject);
1329
144k
    }
1330
29
    else if (maxchar < 256) {
1331
15
        kind = PyUnicode_1BYTE_KIND;
1332
15
        char_size = 1;
1333
15
    }
1334
14
    else if (maxchar < 65536) {
1335
14
        kind = PyUnicode_2BYTE_KIND;
1336
14
        char_size = 2;
1337
14
        if (sizeof(wchar_t) == 2)
1338
0
            is_sharing = 1;
1339
14
    }
1340
0
    else {
1341
0
        if (maxchar > MAX_UNICODE) {
1342
0
            PyErr_SetString(PyExc_SystemError,
1343
0
                            "invalid maximum character passed to PyUnicode_New");
1344
0
            return NULL;
1345
0
        }
1346
0
        kind = PyUnicode_4BYTE_KIND;
1347
0
        char_size = 4;
1348
0
        if (sizeof(wchar_t) == 4)
1349
0
            is_sharing = 1;
1350
0
    }
1351
1352
    /* Ensure we won't overflow the size. */
1353
144k
    if (size < 0) {
1354
0
        PyErr_SetString(PyExc_SystemError,
1355
0
                        "Negative size passed to PyUnicode_New");
1356
0
        return NULL;
1357
0
    }
1358
144k
    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1359
0
        return PyErr_NoMemory();
1360
1361
    /* Duplicated allocation code from _PyObject_New() instead of a call to
1362
     * PyObject_New() so we are able to allocate space for the object and
1363
     * it's data buffer.
1364
     */
1365
144k
    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1366
144k
    if (obj == NULL)
1367
0
        return PyErr_NoMemory();
1368
144k
    obj = PyObject_INIT(obj, &PyUnicode_Type);
1369
144k
    if (obj == NULL)
1370
0
        return NULL;
1371
1372
144k
    unicode = (PyCompactUnicodeObject *)obj;
1373
144k
    if (is_ascii)
1374
144k
        data = ((PyASCIIObject*)obj) + 1;
1375
29
    else
1376
29
        data = unicode + 1;
1377
144k
    _PyUnicode_LENGTH(unicode) = size;
1378
144k
    _PyUnicode_HASH(unicode) = -1;
1379
144k
    _PyUnicode_STATE(unicode).interned = 0;
1380
144k
    _PyUnicode_STATE(unicode).kind = kind;
1381
144k
    _PyUnicode_STATE(unicode).compact = 1;
1382
144k
    _PyUnicode_STATE(unicode).ready = 1;
1383
144k
    _PyUnicode_STATE(unicode).ascii = is_ascii;
1384
144k
    if (is_ascii) {
1385
144k
        ((char*)data)[size] = 0;
1386
144k
        _PyUnicode_WSTR(unicode) = NULL;
1387
144k
    }
1388
29
    else if (kind == PyUnicode_1BYTE_KIND) {
1389
15
        ((char*)data)[size] = 0;
1390
15
        _PyUnicode_WSTR(unicode) = NULL;
1391
15
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1392
15
        unicode->utf8 = NULL;
1393
15
        unicode->utf8_length = 0;
1394
15
    }
1395
14
    else {
1396
14
        unicode->utf8 = NULL;
1397
14
        unicode->utf8_length = 0;
1398
14
        if (kind == PyUnicode_2BYTE_KIND)
1399
14
            ((Py_UCS2*)data)[size] = 0;
1400
0
        else /* kind == PyUnicode_4BYTE_KIND */
1401
0
            ((Py_UCS4*)data)[size] = 0;
1402
14
        if (is_sharing) {
1403
0
            _PyUnicode_WSTR_LENGTH(unicode) = size;
1404
0
            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1405
0
        }
1406
14
        else {
1407
14
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408
14
            _PyUnicode_WSTR(unicode) = NULL;
1409
14
        }
1410
14
    }
1411
#ifdef Py_DEBUG
1412
    unicode_fill_invalid((PyObject*)unicode, 0);
1413
#endif
1414
144k
    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1415
144k
    return obj;
1416
144k
}
1417
1418
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, storing the
   result in the 4-byte data of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit, including a lone surrogate, is copied as is.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1457
1458
static int
1459
unicode_check_modifiable(PyObject *unicode)
1460
0
{
1461
0
    if (!unicode_modifiable(unicode)) {
1462
0
        PyErr_SetString(PyExc_SystemError,
1463
0
                        "Cannot modify a string currently used");
1464
0
        return -1;
1465
0
    }
1466
0
    return 0;
1467
0
}
1468
1469
static int
1470
_copy_characters(PyObject *to, Py_ssize_t to_start,
1471
                 PyObject *from, Py_ssize_t from_start,
1472
                 Py_ssize_t how_many, int check_maxchar)
1473
19.1k
{
1474
19.1k
    unsigned int from_kind, to_kind;
1475
19.1k
    void *from_data, *to_data;
1476
1477
19.1k
    assert(0 <= how_many);
1478
19.1k
    assert(0 <= from_start);
1479
19.1k
    assert(0 <= to_start);
1480
19.1k
    assert(PyUnicode_Check(from));
1481
19.1k
    assert(PyUnicode_IS_READY(from));
1482
19.1k
    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1483
1484
19.1k
    assert(PyUnicode_Check(to));
1485
19.1k
    assert(PyUnicode_IS_READY(to));
1486
19.1k
    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1487
1488
19.1k
    if (how_many == 0)
1489
0
        return 0;
1490
1491
19.1k
    from_kind = PyUnicode_KIND(from);
1492
19.1k
    from_data = PyUnicode_DATA(from);
1493
19.1k
    to_kind = PyUnicode_KIND(to);
1494
19.1k
    to_data = PyUnicode_DATA(to);
1495
1496
#ifdef Py_DEBUG
1497
    if (!check_maxchar
1498
        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1499
    {
1500
        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1501
        Py_UCS4 ch;
1502
        Py_ssize_t i;
1503
        for (i=0; i < how_many; i++) {
1504
            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505
            assert(ch <= to_maxchar);
1506
        }
1507
    }
1508
#endif
1509
1510
19.1k
    if (from_kind == to_kind) {
1511
19.1k
        if (check_maxchar
1512
19.1k
            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1513
0
        {
1514
            /* Writing Latin-1 characters into an ASCII string requires to
1515
               check that all written characters are pure ASCII */
1516
0
            Py_UCS4 max_char;
1517
0
            max_char = ucs1lib_find_max_char(from_data,
1518
0
                                             (Py_UCS1*)from_data + how_many);
1519
0
            if (max_char >= 128)
1520
0
                return -1;
1521
0
        }
1522
19.1k
        memcpy((char*)to_data + to_kind * to_start,
1523
19.1k
                  (char*)from_data + from_kind * from_start,
1524
19.1k
                  to_kind * how_many);
1525
19.1k
    }
1526
14
    else if (from_kind == PyUnicode_1BYTE_KIND
1527
14
             && to_kind == PyUnicode_2BYTE_KIND)
1528
14
    {
1529
14
        _PyUnicode_CONVERT_BYTES(
1530
14
            Py_UCS1, Py_UCS2,
1531
14
            PyUnicode_1BYTE_DATA(from) + from_start,
1532
14
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1533
14
            PyUnicode_2BYTE_DATA(to) + to_start
1534
14
            );
1535
14
    }
1536
0
    else if (from_kind == PyUnicode_1BYTE_KIND
1537
0
             && to_kind == PyUnicode_4BYTE_KIND)
1538
0
    {
1539
0
        _PyUnicode_CONVERT_BYTES(
1540
0
            Py_UCS1, Py_UCS4,
1541
0
            PyUnicode_1BYTE_DATA(from) + from_start,
1542
0
            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1543
0
            PyUnicode_4BYTE_DATA(to) + to_start
1544
0
            );
1545
0
    }
1546
0
    else if (from_kind == PyUnicode_2BYTE_KIND
1547
0
             && to_kind == PyUnicode_4BYTE_KIND)
1548
0
    {
1549
0
        _PyUnicode_CONVERT_BYTES(
1550
0
            Py_UCS2, Py_UCS4,
1551
0
            PyUnicode_2BYTE_DATA(from) + from_start,
1552
0
            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1553
0
            PyUnicode_4BYTE_DATA(to) + to_start
1554
0
            );
1555
0
    }
1556
0
    else {
1557
0
        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1558
1559
0
        if (!check_maxchar) {
1560
0
            if (from_kind == PyUnicode_2BYTE_KIND
1561
0
                && to_kind == PyUnicode_1BYTE_KIND)
1562
0
            {
1563
0
                _PyUnicode_CONVERT_BYTES(
1564
0
                    Py_UCS2, Py_UCS1,
1565
0
                    PyUnicode_2BYTE_DATA(from) + from_start,
1566
0
                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1567
0
                    PyUnicode_1BYTE_DATA(to) + to_start
1568
0
                    );
1569
0
            }
1570
0
            else if (from_kind == PyUnicode_4BYTE_KIND
1571
0
                     && to_kind == PyUnicode_1BYTE_KIND)
1572
0
            {
1573
0
                _PyUnicode_CONVERT_BYTES(
1574
0
                    Py_UCS4, Py_UCS1,
1575
0
                    PyUnicode_4BYTE_DATA(from) + from_start,
1576
0
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1577
0
                    PyUnicode_1BYTE_DATA(to) + to_start
1578
0
                    );
1579
0
            }
1580
0
            else if (from_kind == PyUnicode_4BYTE_KIND
1581
0
                     && to_kind == PyUnicode_2BYTE_KIND)
1582
0
            {
1583
0
                _PyUnicode_CONVERT_BYTES(
1584
0
                    Py_UCS4, Py_UCS2,
1585
0
                    PyUnicode_4BYTE_DATA(from) + from_start,
1586
0
                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1587
0
                    PyUnicode_2BYTE_DATA(to) + to_start
1588
0
                    );
1589
0
            }
1590
0
            else {
1591
0
                Py_UNREACHABLE();
1592
0
            }
1593
0
        }
1594
0
        else {
1595
0
            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1596
0
            Py_UCS4 ch;
1597
0
            Py_ssize_t i;
1598
1599
0
            for (i=0; i < how_many; i++) {
1600
0
                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1601
0
                if (ch > to_maxchar)
1602
0
                    return -1;
1603
0
                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1604
0
            }
1605
0
        }
1606
0
    }
1607
19.1k
    return 0;
1608
19.1k
}
1609
1610
void
1611
_PyUnicode_FastCopyCharacters(
1612
    PyObject *to, Py_ssize_t to_start,
1613
    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1614
19.1k
{
1615
19.1k
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1616
19.1k
}
1617
1618
Py_ssize_t
1619
PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1620
                         PyObject *from, Py_ssize_t from_start,
1621
                         Py_ssize_t how_many)
1622
0
{
1623
0
    int err;
1624
1625
0
    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1626
0
        PyErr_BadInternalCall();
1627
0
        return -1;
1628
0
    }
1629
1630
0
    if (PyUnicode_READY(from) == -1)
1631
0
        return -1;
1632
0
    if (PyUnicode_READY(to) == -1)
1633
0
        return -1;
1634
1635
0
    if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
1636
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1637
0
        return -1;
1638
0
    }
1639
0
    if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
1640
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
1641
0
        return -1;
1642
0
    }
1643
0
    if (how_many < 0) {
1644
0
        PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1645
0
        return -1;
1646
0
    }
1647
0
    how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
1648
0
    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1649
0
        PyErr_Format(PyExc_SystemError,
1650
0
                     "Cannot write %zi characters at %zi "
1651
0
                     "in a string of %zi characters",
1652
0
                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1653
0
        return -1;
1654
0
    }
1655
1656
0
    if (how_many == 0)
1657
0
        return 0;
1658
1659
0
    if (unicode_check_modifiable(to))
1660
0
        return -1;
1661
1662
0
    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1663
0
    if (err) {
1664
0
        PyErr_Format(PyExc_SystemError,
1665
0
                     "Cannot copy %s characters "
1666
0
                     "into a string of %s characters",
1667
0
                     unicode_kind_name(from),
1668
0
                     unicode_kind_name(to));
1669
0
        return -1;
1670
0
    }
1671
0
    return how_many;
1672
0
}
1673
1674
/* Find the maximum code point and count the number of surrogate pairs so a
1675
   correct string length can be computed before converting a string to UCS4.
1676
   This function counts single surrogates as a character and not as a pair.
1677
1678
   Return 0 on success, or -1 on error. */
1679
static int
1680
find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1681
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1682
5.14k
{
1683
5.14k
    const wchar_t *iter;
1684
5.14k
    Py_UCS4 ch;
1685
1686
5.14k
    assert(num_surrogates != NULL && maxchar != NULL);
1687
5.14k
    *num_surrogates = 0;
1688
5.14k
    *maxchar = 0;
1689
1690
67.8k
    for (iter = begin; iter < end; ) {
1691
#if SIZEOF_WCHAR_T == 2
1692
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1693
            && (iter+1) < end
1694
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1695
        {
1696
            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1697
            ++(*num_surrogates);
1698
            iter += 2;
1699
        }
1700
        else
1701
#endif
1702
62.6k
        {
1703
62.6k
            ch = *iter;
1704
62.6k
            iter++;
1705
62.6k
        }
1706
62.6k
        if (ch > *maxchar) {
1707
16.2k
            *maxchar = ch;
1708
16.2k
            if (*maxchar > MAX_UNICODE) {
1709
0
                PyErr_Format(PyExc_ValueError,
1710
0
                             "character U+%x is not in range [U+0000; U+10ffff]",
1711
0
                             ch);
1712
0
                return -1;
1713
0
            }
1714
16.2k
        }
1715
62.6k
    }
1716
5.14k
    return 0;
1717
5.14k
}
1718
1719
int
1720
_PyUnicode_Ready(PyObject *unicode)
1721
0
{
1722
0
    wchar_t *end;
1723
0
    Py_UCS4 maxchar = 0;
1724
0
    Py_ssize_t num_surrogates;
1725
#if SIZEOF_WCHAR_T == 2
1726
    Py_ssize_t length_wo_surrogates;
1727
#endif
1728
1729
    /* _PyUnicode_Ready() is only intended for old-style API usage where
1730
       strings were created using _PyObject_New() and where no canonical
1731
       representation (the str field) has been set yet aka strings
1732
       which are not yet ready. */
1733
0
    assert(_PyUnicode_CHECK(unicode));
1734
0
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1735
0
    assert(_PyUnicode_WSTR(unicode) != NULL);
1736
0
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1737
0
    assert(_PyUnicode_UTF8(unicode) == NULL);
1738
    /* Actually, it should neither be interned nor be anything else: */
1739
0
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1740
1741
0
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1742
0
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1743
0
                                &maxchar, &num_surrogates) == -1)
1744
0
        return -1;
1745
1746
0
    if (maxchar < 256) {
1747
0
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1748
0
        if (!_PyUnicode_DATA_ANY(unicode)) {
1749
0
            PyErr_NoMemory();
1750
0
            return -1;
1751
0
        }
1752
0
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1753
0
                                _PyUnicode_WSTR(unicode), end,
1754
0
                                PyUnicode_1BYTE_DATA(unicode));
1755
0
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1756
0
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1757
0
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1758
0
        if (maxchar < 128) {
1759
0
            _PyUnicode_STATE(unicode).ascii = 1;
1760
0
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1761
0
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1762
0
        }
1763
0
        else {
1764
0
            _PyUnicode_STATE(unicode).ascii = 0;
1765
0
            _PyUnicode_UTF8(unicode) = NULL;
1766
0
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1767
0
        }
1768
0
        PyObject_FREE(_PyUnicode_WSTR(unicode));
1769
0
        _PyUnicode_WSTR(unicode) = NULL;
1770
0
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1771
0
    }
1772
    /* In this case we might have to convert down from 4-byte native
1773
       wchar_t to 2-byte unicode. */
1774
0
    else if (maxchar < 65536) {
1775
0
        assert(num_surrogates == 0 &&
1776
0
               "FindMaxCharAndNumSurrogatePairs() messed up");
1777
1778
#if SIZEOF_WCHAR_T == 2
1779
        /* We can share representations and are done. */
1780
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1781
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1782
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1783
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1784
        _PyUnicode_UTF8(unicode) = NULL;
1785
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1786
#else
1787
        /* sizeof(wchar_t) == 4 */
1788
0
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1789
0
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1790
0
        if (!_PyUnicode_DATA_ANY(unicode)) {
1791
0
            PyErr_NoMemory();
1792
0
            return -1;
1793
0
        }
1794
0
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1795
0
                                _PyUnicode_WSTR(unicode), end,
1796
0
                                PyUnicode_2BYTE_DATA(unicode));
1797
0
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1798
0
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1799
0
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1800
0
        _PyUnicode_UTF8(unicode) = NULL;
1801
0
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1802
0
        PyObject_FREE(_PyUnicode_WSTR(unicode));
1803
0
        _PyUnicode_WSTR(unicode) = NULL;
1804
0
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1805
0
#endif
1806
0
    }
1807
    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1808
0
    else {
1809
#if SIZEOF_WCHAR_T == 2
1810
        /* in case the native representation is 2-bytes, we need to allocate a
1811
           new normalized 4-byte version. */
1812
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1813
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1814
            PyErr_NoMemory();
1815
            return -1;
1816
        }
1817
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1818
        if (!_PyUnicode_DATA_ANY(unicode)) {
1819
            PyErr_NoMemory();
1820
            return -1;
1821
        }
1822
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1823
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1824
        _PyUnicode_UTF8(unicode) = NULL;
1825
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1826
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1827
        _PyUnicode_STATE(unicode).ready = 1;
1828
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1829
        PyObject_FREE(_PyUnicode_WSTR(unicode));
1830
        _PyUnicode_WSTR(unicode) = NULL;
1831
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1832
#else
1833
0
        assert(num_surrogates == 0);
1834
1835
0
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1836
0
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1837
0
        _PyUnicode_UTF8(unicode) = NULL;
1838
0
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1839
0
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1840
0
#endif
1841
0
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1842
0
    }
1843
0
    _PyUnicode_STATE(unicode).ready = 1;
1844
0
    assert(_PyUnicode_CheckConsistency(unicode, 1));
1845
0
    return 0;
1846
0
}
1847
1848
static void
1849
unicode_dealloc(PyObject *unicode)
1850
76.6k
{
1851
76.6k
    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1852
75.4k
    case SSTATE_NOT_INTERNED:
1853
75.4k
        break;
1854
1855
1.16k
    case SSTATE_INTERNED_MORTAL:
1856
        /* revive dead object temporarily for DelItem */
1857
1.16k
        Py_REFCNT(unicode) = 3;
1858
1.16k
        if (PyDict_DelItem(interned, unicode) != 0)
1859
0
            Py_FatalError(
1860
0
                "deletion of interned string failed");
1861
1.16k
        break;
1862
1863
1.16k
    case SSTATE_INTERNED_IMMORTAL:
1864
0
        Py_FatalError("Immortal interned string died.");
1865
        /* fall through */
1866
1867
0
    default:
1868
0
        Py_FatalError("Inconsistent interned string state.");
1869
76.6k
    }
1870
1871
76.6k
    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1872
0
        PyObject_DEL(_PyUnicode_WSTR(unicode));
1873
76.6k
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1874
0
        PyObject_DEL(_PyUnicode_UTF8(unicode));
1875
76.6k
    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1876
0
        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1877
1878
76.6k
    Py_TYPE(unicode)->tp_free(unicode);
1879
76.6k
}
1880
1881
#ifdef Py_DEBUG
/* Debug helper: return 1 when `unicode` is one of the shared singletons
   (the empty string, or the cached one-character string for a code point
   below 256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only ready strings of exactly one character can be in the cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && unicode_latin1[first] == unicode)
        return 1;
    return 0;
}
#endif
1897
1898
static int
1899
unicode_modifiable(PyObject *unicode)
1900
3.60k
{
1901
3.60k
    assert(_PyUnicode_CHECK(unicode));
1902
3.60k
    if (Py_REFCNT(unicode) != 1)
1903
3.40k
        return 0;
1904
208
    if (_PyUnicode_HASH(unicode) != -1)
1905
0
        return 0;
1906
208
    if (PyUnicode_CHECK_INTERNED(unicode))
1907
0
        return 0;
1908
208
    if (!PyUnicode_CheckExact(unicode))
1909
0
        return 0;
1910
#ifdef Py_DEBUG
1911
    /* singleton refcount is greater than 1 */
1912
    assert(!unicode_is_singleton(unicode));
1913
#endif
1914
208
    return 1;
1915
208
}
1916
1917
static int
1918
unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1919
104
{
1920
104
    PyObject *unicode;
1921
104
    Py_ssize_t old_length;
1922
1923
104
    assert(p_unicode != NULL);
1924
104
    unicode = *p_unicode;
1925
1926
104
    assert(unicode != NULL);
1927
104
    assert(PyUnicode_Check(unicode));
1928
104
    assert(0 <= length);
1929
1930
104
    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1931
0
        old_length = PyUnicode_WSTR_LENGTH(unicode);
1932
104
    else
1933
104
        old_length = PyUnicode_GET_LENGTH(unicode);
1934
104
    if (old_length == length)
1935
0
        return 0;
1936
1937
104
    if (length == 0) {
1938
0
        _Py_INCREF_UNICODE_EMPTY();
1939
0
        if (!unicode_empty)
1940
0
            return -1;
1941
0
        Py_SETREF(*p_unicode, unicode_empty);
1942
0
        return 0;
1943
0
    }
1944
1945
104
    if (!unicode_modifiable(unicode)) {
1946
0
        PyObject *copy = resize_copy(unicode, length);
1947
0
        if (copy == NULL)
1948
0
            return -1;
1949
0
        Py_SETREF(*p_unicode, copy);
1950
0
        return 0;
1951
0
    }
1952
1953
104
    if (PyUnicode_IS_COMPACT(unicode)) {
1954
104
        PyObject *new_unicode = resize_compact(unicode, length);
1955
104
        if (new_unicode == NULL)
1956
0
            return -1;
1957
104
        *p_unicode = new_unicode;
1958
104
        return 0;
1959
104
    }
1960
0
    return resize_inplace(unicode, length);
1961
104
}
1962
1963
int
1964
PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1965
0
{
1966
0
    PyObject *unicode;
1967
0
    if (p_unicode == NULL) {
1968
0
        PyErr_BadInternalCall();
1969
0
        return -1;
1970
0
    }
1971
0
    unicode = *p_unicode;
1972
0
    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1973
0
    {
1974
0
        PyErr_BadInternalCall();
1975
0
        return -1;
1976
0
    }
1977
0
    return unicode_resize(p_unicode, length);
1978
0
}
1979
1980
/* Copy an ASCII or latin1 char* string into a Python Unicode string.
1981
1982
   WARNING: The function doesn't copy the terminating null character and
1983
   doesn't check the maximum character (may write a latin1 character in an
1984
   ASCII string). */
1985
static void
1986
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1987
                   const char *str, Py_ssize_t len)
1988
0
{
1989
0
    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1990
0
    void *data = PyUnicode_DATA(unicode);
1991
0
    const char *end = str + len;
1992
1993
0
    switch (kind) {
1994
0
    case PyUnicode_1BYTE_KIND: {
1995
0
        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1996
#ifdef Py_DEBUG
1997
        if (PyUnicode_IS_ASCII(unicode)) {
1998
            Py_UCS4 maxchar = ucs1lib_find_max_char(
1999
                (const Py_UCS1*)str,
2000
                (const Py_UCS1*)str + len);
2001
            assert(maxchar < 128);
2002
        }
2003
#endif
2004
0
        memcpy((char *) data + index, str, len);
2005
0
        break;
2006
0
    }
2007
0
    case PyUnicode_2BYTE_KIND: {
2008
0
        Py_UCS2 *start = (Py_UCS2 *)data + index;
2009
0
        Py_UCS2 *ucs2 = start;
2010
0
        assert(index <= PyUnicode_GET_LENGTH(unicode));
2011
2012
0
        for (; str < end; ++ucs2, ++str)
2013
0
            *ucs2 = (Py_UCS2)*str;
2014
2015
0
        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
2016
0
        break;
2017
0
    }
2018
0
    default: {
2019
0
        Py_UCS4 *start = (Py_UCS4 *)data + index;
2020
0
        Py_UCS4 *ucs4 = start;
2021
0
        assert(kind == PyUnicode_4BYTE_KIND);
2022
0
        assert(index <= PyUnicode_GET_LENGTH(unicode));
2023
2024
0
        for (; str < end; ++ucs4, ++str)
2025
0
            *ucs4 = (Py_UCS4)*str;
2026
2027
0
        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
2028
0
    }
2029
0
    }
2030
0
}
2031
2032
static PyObject*
2033
get_latin1_char(unsigned char ch)
2034
8.09k
{
2035
8.09k
    PyObject *unicode = unicode_latin1[ch];
2036
8.09k
    if (!unicode) {
2037
932
        unicode = PyUnicode_New(1, ch);
2038
932
        if (!unicode)
2039
0
            return NULL;
2040
932
        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2041
932
        assert(_PyUnicode_CheckConsistency(unicode, 1));
2042
932
        unicode_latin1[ch] = unicode;
2043
932
    }
2044
8.09k
    Py_INCREF(unicode);
2045
8.09k
    return unicode;
2046
8.09k
}
2047
2048
static PyObject*
2049
unicode_char(Py_UCS4 ch)
2050
5.56k
{
2051
5.56k
    PyObject *unicode;
2052
2053
5.56k
    assert(ch <= MAX_UNICODE);
2054
2055
5.56k
    if (ch < 256)
2056
5.56k
        return get_latin1_char(ch);
2057
2058
0
    unicode = PyUnicode_New(1, ch);
2059
0
    if (unicode == NULL)
2060
0
        return NULL;
2061
2062
0
    assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2063
0
    if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
2064
0
        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
2065
0
    } else {
2066
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2067
0
        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2068
0
    }
2069
0
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2070
0
    return unicode;
2071
0
}
2072
2073
PyObject *
2074
PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
2075
0
{
2076
0
    if (u == NULL)
2077
0
        return (PyObject*)_PyUnicode_New(size);
2078
2079
0
    if (size < 0) {
2080
0
        PyErr_BadInternalCall();
2081
0
        return NULL;
2082
0
    }
2083
2084
0
    return PyUnicode_FromWideChar(u, size);
2085
0
}
2086
2087
PyObject *
2088
PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2089
5.15k
{
2090
5.15k
    PyObject *unicode;
2091
5.15k
    Py_UCS4 maxchar = 0;
2092
5.15k
    Py_ssize_t num_surrogates;
2093
2094
5.15k
    if (u == NULL && size != 0) {
2095
0
        PyErr_BadInternalCall();
2096
0
        return NULL;
2097
0
    }
2098
2099
5.15k
    if (size == -1) {
2100
294
        size = wcslen(u);
2101
294
    }
2102
2103
    /* If the Unicode data is known at construction time, we can apply
2104
       some optimizations which share commonly used objects. */
2105
2106
    /* Optimization for empty strings */
2107
5.15k
    if (size == 0)
2108
14
        _Py_RETURN_UNICODE_EMPTY();
2109
2110
    /* Single character Unicode objects in the Latin-1 range are
2111
       shared when using this constructor */
2112
5.14k
    if (size == 1 && (Py_UCS4)*u < 256)
2113
0
        return get_latin1_char((unsigned char)*u);
2114
2115
    /* If not empty and not single character, copy the Unicode data
2116
       into the new object */
2117
5.14k
    if (find_maxchar_surrogates(u, u + size,
2118
5.14k
                                &maxchar, &num_surrogates) == -1)
2119
0
        return NULL;
2120
2121
5.14k
    unicode = PyUnicode_New(size - num_surrogates, maxchar);
2122
5.14k
    if (!unicode)
2123
0
        return NULL;
2124
2125
5.14k
    switch (PyUnicode_KIND(unicode)) {
2126
5.14k
    case PyUnicode_1BYTE_KIND:
2127
5.14k
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
2128
5.14k
                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
2129
5.14k
        break;
2130
0
    case PyUnicode_2BYTE_KIND:
2131
#if Py_UNICODE_SIZE == 2
2132
        memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
2133
#else
2134
0
        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
2135
0
                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
2136
0
#endif
2137
0
        break;
2138
0
    case PyUnicode_4BYTE_KIND:
2139
#if SIZEOF_WCHAR_T == 2
2140
        /* This is the only case which has to process surrogates, thus
2141
           a simple copy loop is not enough and we need a function. */
2142
        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
2143
#else
2144
0
        assert(num_surrogates == 0);
2145
0
        memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
2146
0
#endif
2147
0
        break;
2148
0
    default:
2149
0
        Py_UNREACHABLE();
2150
5.14k
    }
2151
2152
5.14k
    return unicode_result(unicode);
2153
5.14k
}
2154
2155
PyObject *
2156
PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
2157
2.08k
{
2158
2.08k
    if (size < 0) {
2159
0
        PyErr_SetString(PyExc_SystemError,
2160
0
                        "Negative size passed to PyUnicode_FromStringAndSize");
2161
0
        return NULL;
2162
0
    }
2163
2.08k
    if (u != NULL)
2164
2.07k
        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2165
14
    else
2166
14
        return (PyObject *)_PyUnicode_New(size);
2167
2.08k
}
2168
2169
PyObject *
2170
PyUnicode_FromString(const char *u)
2171
55.6k
{
2172
55.6k
    size_t size = strlen(u);
2173
55.6k
    if (size > PY_SSIZE_T_MAX) {
2174
0
        PyErr_SetString(PyExc_OverflowError, "input too long");
2175
0
        return NULL;
2176
0
    }
2177
55.6k
    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
2178
55.6k
}
2179
2180
PyObject *
2181
_PyUnicode_FromId(_Py_Identifier *id)
2182
101k
{
2183
101k
    if (!id->object) {
2184
1.41k
        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2185
1.41k
                                                  strlen(id->string),
2186
1.41k
                                                  NULL, NULL);
2187
1.41k
        if (!id->object)
2188
0
            return NULL;
2189
1.41k
        PyUnicode_InternInPlace(&id->object);
2190
1.41k
        assert(!id->next);
2191
1.41k
        id->next = static_strings;
2192
1.41k
        static_strings = id;
2193
1.41k
    }
2194
101k
    return id->object;
2195
101k
}
2196
2197
void
2198
_PyUnicode_ClearStaticStrings()
2199
0
{
2200
0
    _Py_Identifier *tmp, *s = static_strings;
2201
0
    while (s) {
2202
0
        Py_CLEAR(s->object);
2203
0
        tmp = s->next;
2204
0
        s->next = NULL;
2205
0
        s = tmp;
2206
0
    }
2207
0
    static_strings = NULL;
2208
0
}
2209
2210
/* Internal function, doesn't check maximum character */
2211
2212
PyObject*
2213
_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
2214
3.97k
{
2215
3.97k
    const unsigned char *s = (const unsigned char *)buffer;
2216
3.97k
    PyObject *unicode;
2217
3.97k
    if (size == 1) {
2218
#ifdef Py_DEBUG
2219
        assert((unsigned char)s[0] < 128);
2220
#endif
2221
196
        return get_latin1_char(s[0]);
2222
196
    }
2223
3.78k
    unicode = PyUnicode_New(size, 127);
2224
3.78k
    if (!unicode)
2225
0
        return NULL;
2226
3.78k
    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2227
3.78k
    assert(_PyUnicode_CheckConsistency(unicode, 1));
2228
3.78k
    return unicode;
2229
3.78k
}
2230
2231
static Py_UCS4
2232
kind_maxchar_limit(unsigned int kind)
2233
0
{
2234
0
    switch (kind) {
2235
0
    case PyUnicode_1BYTE_KIND:
2236
0
        return 0x80;
2237
0
    case PyUnicode_2BYTE_KIND:
2238
0
        return 0x100;
2239
0
    case PyUnicode_4BYTE_KIND:
2240
0
        return 0x10000;
2241
0
    default:
2242
0
        Py_UNREACHABLE();
2243
0
    }
2244
0
}
2245
2246
static PyObject*
2247
_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
2248
59.8k
{
2249
59.8k
    PyObject *res;
2250
59.8k
    unsigned char max_char;
2251
2252
59.8k
    if (size == 0)
2253
166
        _Py_RETURN_UNICODE_EMPTY();
2254
59.7k
    assert(size > 0);
2255
59.7k
    if (size == 1)
2256
1.76k
        return get_latin1_char(u[0]);
2257
2258
57.9k
    max_char = ucs1lib_find_max_char(u, u + size);
2259
57.9k
    res = PyUnicode_New(size, max_char);
2260
57.9k
    if (!res)
2261
0
        return NULL;
2262
57.9k
    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2263
57.9k
    assert(_PyUnicode_CheckConsistency(res, 1));
2264
57.9k
    return res;
2265
57.9k
}
2266
2267
static PyObject*
2268
_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2269
0
{
2270
0
    PyObject *res;
2271
0
    Py_UCS2 max_char;
2272
2273
0
    if (size == 0)
2274
0
        _Py_RETURN_UNICODE_EMPTY();
2275
0
    assert(size > 0);
2276
0
    if (size == 1)
2277
0
        return unicode_char(u[0]);
2278
2279
0
    max_char = ucs2lib_find_max_char(u, u + size);
2280
0
    res = PyUnicode_New(size, max_char);
2281
0
    if (!res)
2282
0
        return NULL;
2283
0
    if (max_char >= 256)
2284
0
        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2285
0
    else {
2286
0
        _PyUnicode_CONVERT_BYTES(
2287
0
            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2288
0
    }
2289
0
    assert(_PyUnicode_CheckConsistency(res, 1));
2290
0
    return res;
2291
0
}
2292
2293
static PyObject*
2294
_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2295
0
{
2296
0
    PyObject *res;
2297
0
    Py_UCS4 max_char;
2298
2299
0
    if (size == 0)
2300
0
        _Py_RETURN_UNICODE_EMPTY();
2301
0
    assert(size > 0);
2302
0
    if (size == 1)
2303
0
        return unicode_char(u[0]);
2304
2305
0
    max_char = ucs4lib_find_max_char(u, u + size);
2306
0
    res = PyUnicode_New(size, max_char);
2307
0
    if (!res)
2308
0
        return NULL;
2309
0
    if (max_char < 256)
2310
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2311
0
                                 PyUnicode_1BYTE_DATA(res));
2312
0
    else if (max_char < 0x10000)
2313
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2314
0
                                 PyUnicode_2BYTE_DATA(res));
2315
0
    else
2316
0
        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2317
0
    assert(_PyUnicode_CheckConsistency(res, 1));
2318
0
    return res;
2319
0
}
2320
2321
PyObject*
2322
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2323
59.8k
{
2324
59.8k
    if (size < 0) {
2325
0
        PyErr_SetString(PyExc_ValueError, "size must be positive");
2326
0
        return NULL;
2327
0
    }
2328
59.8k
    switch (kind) {
2329
59.8k
    case PyUnicode_1BYTE_KIND:
2330
59.8k
        return _PyUnicode_FromUCS1(buffer, size);
2331
0
    case PyUnicode_2BYTE_KIND:
2332
0
        return _PyUnicode_FromUCS2(buffer, size);
2333
0
    case PyUnicode_4BYTE_KIND:
2334
0
        return _PyUnicode_FromUCS4(buffer, size);
2335
0
    default:
2336
0
        PyErr_SetString(PyExc_SystemError, "invalid kind");
2337
0
        return NULL;
2338
59.8k
    }
2339
59.8k
}
2340
2341
Py_UCS4
2342
_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2343
120
{
2344
120
    enum PyUnicode_Kind kind;
2345
120
    void *startptr, *endptr;
2346
2347
120
    assert(PyUnicode_IS_READY(unicode));
2348
120
    assert(0 <= start);
2349
120
    assert(end <= PyUnicode_GET_LENGTH(unicode));
2350
120
    assert(start <= end);
2351
2352
120
    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2353
0
        return PyUnicode_MAX_CHAR_VALUE(unicode);
2354
2355
120
    if (start == end)
2356
0
        return 127;
2357
2358
120
    if (PyUnicode_IS_ASCII(unicode))
2359
120
        return 127;
2360
2361
0
    kind = PyUnicode_KIND(unicode);
2362
0
    startptr = PyUnicode_DATA(unicode);
2363
0
    endptr = (char *)startptr + end * kind;
2364
0
    startptr = (char *)startptr + start * kind;
2365
0
    switch(kind) {
2366
0
    case PyUnicode_1BYTE_KIND:
2367
0
        return ucs1lib_find_max_char(startptr, endptr);
2368
0
    case PyUnicode_2BYTE_KIND:
2369
0
        return ucs2lib_find_max_char(startptr, endptr);
2370
0
    case PyUnicode_4BYTE_KIND:
2371
0
        return ucs4lib_find_max_char(startptr, endptr);
2372
0
    default:
2373
0
        Py_UNREACHABLE();
2374
0
    }
2375
0
}
2376
2377
/* Ensure that a string uses the most efficient storage, if it is not the
2378
   case: create a new string with of the right kind. Write NULL into *p_unicode
2379
   on error. */
2380
static void
2381
unicode_adjust_maxchar(PyObject **p_unicode)
2382
0
{
2383
0
    PyObject *unicode, *copy;
2384
0
    Py_UCS4 max_char;
2385
0
    Py_ssize_t len;
2386
0
    unsigned int kind;
2387
2388
0
    assert(p_unicode != NULL);
2389
0
    unicode = *p_unicode;
2390
0
    assert(PyUnicode_IS_READY(unicode));
2391
0
    if (PyUnicode_IS_ASCII(unicode))
2392
0
        return;
2393
2394
0
    len = PyUnicode_GET_LENGTH(unicode);
2395
0
    kind = PyUnicode_KIND(unicode);
2396
0
    if (kind == PyUnicode_1BYTE_KIND) {
2397
0
        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2398
0
        max_char = ucs1lib_find_max_char(u, u + len);
2399
0
        if (max_char >= 128)
2400
0
            return;
2401
0
    }
2402
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2403
0
        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2404
0
        max_char = ucs2lib_find_max_char(u, u + len);
2405
0
        if (max_char >= 256)
2406
0
            return;
2407
0
    }
2408
0
    else {
2409
0
        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2410
0
        assert(kind == PyUnicode_4BYTE_KIND);
2411
0
        max_char = ucs4lib_find_max_char(u, u + len);
2412
0
        if (max_char >= 0x10000)
2413
0
            return;
2414
0
    }
2415
0
    copy = PyUnicode_New(len, max_char);
2416
0
    if (copy != NULL)
2417
0
        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2418
0
    Py_DECREF(unicode);
2419
0
    *p_unicode = copy;
2420
0
}
2421
2422
/* Return a new string with the same kind and contents as `unicode`.

   A non-unicode argument is reported with PyErr_BadInternalCall().  Return
   NULL with an exception set on any failure. */
PyObject*
_PyUnicode_Copy(PyObject *unicode)
{
    Py_ssize_t nchars;
    PyObject *duplicate;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    nchars = PyUnicode_GET_LENGTH(unicode);
    duplicate = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(unicode));
    if (duplicate == NULL)
        return NULL;
    assert(PyUnicode_KIND(duplicate) == PyUnicode_KIND(unicode));

    /* The kind value is used as the element size in bytes. */
    memcpy(PyUnicode_DATA(duplicate), PyUnicode_DATA(unicode),
           nchars * PyUnicode_KIND(unicode));
    assert(_PyUnicode_CheckConsistency(duplicate, 1));
    return duplicate;
}
2446
2447
2448
/* Widen Unicode objects to larger buffers. Don't write terminating null
2449
   character. Return NULL on error. */
2450
2451
void*
2452
_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2453
0
{
2454
0
    Py_ssize_t len;
2455
0
    void *result;
2456
0
    unsigned int skind;
2457
2458
0
    if (PyUnicode_READY(s) == -1)
2459
0
        return NULL;
2460
2461
0
    len = PyUnicode_GET_LENGTH(s);
2462
0
    skind = PyUnicode_KIND(s);
2463
0
    if (skind >= kind) {
2464
0
        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2465
0
        return NULL;
2466
0
    }
2467
0
    switch (kind) {
2468
0
    case PyUnicode_2BYTE_KIND:
2469
0
        result = PyMem_New(Py_UCS2, len);
2470
0
        if (!result)
2471
0
            return PyErr_NoMemory();
2472
0
        assert(skind == PyUnicode_1BYTE_KIND);
2473
0
        _PyUnicode_CONVERT_BYTES(
2474
0
            Py_UCS1, Py_UCS2,
2475
0
            PyUnicode_1BYTE_DATA(s),
2476
0
            PyUnicode_1BYTE_DATA(s) + len,
2477
0
            result);
2478
0
        return result;
2479
0
    case PyUnicode_4BYTE_KIND:
2480
0
        result = PyMem_New(Py_UCS4, len);
2481
0
        if (!result)
2482
0
            return PyErr_NoMemory();
2483
0
        if (skind == PyUnicode_2BYTE_KIND) {
2484
0
            _PyUnicode_CONVERT_BYTES(
2485
0
                Py_UCS2, Py_UCS4,
2486
0
                PyUnicode_2BYTE_DATA(s),
2487
0
                PyUnicode_2BYTE_DATA(s) + len,
2488
0
                result);
2489
0
        }
2490
0
        else {
2491
0
            assert(skind == PyUnicode_1BYTE_KIND);
2492
0
            _PyUnicode_CONVERT_BYTES(
2493
0
                Py_UCS1, Py_UCS4,
2494
0
                PyUnicode_1BYTE_DATA(s),
2495
0
                PyUnicode_1BYTE_DATA(s) + len,
2496
0
                result);
2497
0
        }
2498
0
        return result;
2499
0
    default:
2500
0
        break;
2501
0
    }
2502
0
    PyErr_SetString(PyExc_SystemError, "invalid kind");
2503
0
    return NULL;
2504
0
}
2505
2506
static Py_UCS4*
2507
as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2508
        int copy_null)
2509
0
{
2510
0
    int kind;
2511
0
    void *data;
2512
0
    Py_ssize_t len, targetlen;
2513
0
    if (PyUnicode_READY(string) == -1)
2514
0
        return NULL;
2515
0
    kind = PyUnicode_KIND(string);
2516
0
    data = PyUnicode_DATA(string);
2517
0
    len = PyUnicode_GET_LENGTH(string);
2518
0
    targetlen = len;
2519
0
    if (copy_null)
2520
0
        targetlen++;
2521
0
    if (!target) {
2522
0
        target = PyMem_New(Py_UCS4, targetlen);
2523
0
        if (!target) {
2524
0
            PyErr_NoMemory();
2525
0
            return NULL;
2526
0
        }
2527
0
    }
2528
0
    else {
2529
0
        if (targetsize < targetlen) {
2530
0
            PyErr_Format(PyExc_SystemError,
2531
0
                         "string is longer than the buffer");
2532
0
            if (copy_null && 0 < targetsize)
2533
0
                target[0] = 0;
2534
0
            return NULL;
2535
0
        }
2536
0
    }
2537
0
    if (kind == PyUnicode_1BYTE_KIND) {
2538
0
        Py_UCS1 *start = (Py_UCS1 *) data;
2539
0
        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2540
0
    }
2541
0
    else if (kind == PyUnicode_2BYTE_KIND) {
2542
0
        Py_UCS2 *start = (Py_UCS2 *) data;
2543
0
        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2544
0
    }
2545
0
    else {
2546
0
        assert(kind == PyUnicode_4BYTE_KIND);
2547
0
        memcpy(target, data, len * sizeof(Py_UCS4));
2548
0
    }
2549
0
    if (copy_null)
2550
0
        target[len] = 0;
2551
0
    return target;
2552
0
}
2553
2554
Py_UCS4*
2555
PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2556
                 int copy_null)
2557
0
{
2558
0
    if (target == NULL || targetsize < 0) {
2559
0
        PyErr_BadInternalCall();
2560
0
        return NULL;
2561
0
    }
2562
0
    return as_ucs4(string, target, targetsize, copy_null);
2563
0
}
2564
2565
Py_UCS4*
2566
PyUnicode_AsUCS4Copy(PyObject *string)
2567
0
{
2568
0
    return as_ucs4(string, NULL, 0, 1);
2569
0
}
2570
2571
/* Maximum number of characters required for the output of %lld or %p.
   A value of SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) digits, plus 1 for the sign.
   53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (53 * SIZEOF_LONG_LONG - 1) / 22)
2575
2576
static int
2577
unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2578
                             Py_ssize_t width, Py_ssize_t precision)
2579
11.7k
{
2580
11.7k
    Py_ssize_t length, fill, arglen;
2581
11.7k
    Py_UCS4 maxchar;
2582
2583
11.7k
    if (PyUnicode_READY(str) == -1)
2584
0
        return -1;
2585
2586
11.7k
    length = PyUnicode_GET_LENGTH(str);
2587
11.7k
    if ((precision == -1 || precision >= length)
2588
11.7k
        && width <= length)
2589
11.7k
        return _PyUnicodeWriter_WriteStr(writer, str);
2590
2591
0
    if (precision != -1)
2592
0
        length = Py_MIN(precision, length);
2593
2594
0
    arglen = Py_MAX(length, width);
2595
0
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2596
0
        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2597
0
    else
2598
0
        maxchar = writer->maxchar;
2599
2600
0
    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2601
0
        return -1;
2602
2603
0
    if (width > length) {
2604
0
        fill = width - length;
2605
0
        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2606
0
            return -1;
2607
0
        writer->pos += fill;
2608
0
    }
2609
2610
0
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2611
0
                                  str, 0, length);
2612
0
    writer->pos += length;
2613
0
    return 0;
2614
0
}
2615
2616
static int
2617
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2618
                              Py_ssize_t width, Py_ssize_t precision)
2619
4.91k
{
2620
    /* UTF-8 */
2621
4.91k
    Py_ssize_t length;
2622
4.91k
    PyObject *unicode;
2623
4.91k
    int res;
2624
2625
4.91k
    if (precision == -1) {
2626
604
        length = strlen(str);
2627
604
    }
2628
4.31k
    else {
2629
4.31k
        length = 0;
2630
38.2k
        while (length < precision && str[length]) {
2631
33.9k
            length++;
2632
33.9k
        }
2633
4.31k
    }
2634
4.91k
    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2635
4.91k
    if (unicode == NULL)
2636
0
        return -1;
2637
2638
4.91k
    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2639
4.91k
    Py_DECREF(unicode);
2640
4.91k
    return res;
2641
4.91k
}
2642
2643
static const char*
2644
unicode_fromformat_arg(_PyUnicodeWriter *writer,
2645
                       const char *f, va_list *vargs)
2646
12.3k
{
2647
12.3k
    const char *p;
2648
12.3k
    Py_ssize_t len;
2649
12.3k
    int zeropad;
2650
12.3k
    Py_ssize_t width;
2651
12.3k
    Py_ssize_t precision;
2652
12.3k
    int longflag;
2653
12.3k
    int longlongflag;
2654
12.3k
    int size_tflag;
2655
12.3k
    Py_ssize_t fill;
2656
2657
12.3k
    p = f;
2658
12.3k
    f++;
2659
12.3k
    zeropad = 0;
2660
12.3k
    if (*f == '0') {
2661
0
        zeropad = 1;
2662
0
        f++;
2663
0
    }
2664
2665
    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2666
12.3k
    width = -1;
2667
12.3k
    if (Py_ISDIGIT((unsigned)*f)) {
2668
0
        width = *f - '0';
2669
0
        f++;
2670
0
        while (Py_ISDIGIT((unsigned)*f)) {
2671
0
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2672
0
                PyErr_SetString(PyExc_ValueError,
2673
0
                                "width too big");
2674
0
                return NULL;
2675
0
            }
2676
0
            width = (width * 10) + (*f - '0');
2677
0
            f++;
2678
0
        }
2679
0
    }
2680
12.3k
    precision = -1;
2681
12.3k
    if (*f == '.') {
2682
4.31k
        f++;
2683
4.31k
        if (Py_ISDIGIT((unsigned)*f)) {
2684
4.31k
            precision = (*f - '0');
2685
4.31k
            f++;
2686
9.48k
            while (Py_ISDIGIT((unsigned)*f)) {
2687
5.17k
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2688
0
                    PyErr_SetString(PyExc_ValueError,
2689
0
                                    "precision too big");
2690
0
                    return NULL;
2691
0
                }
2692
5.17k
                precision = (precision * 10) + (*f - '0');
2693
5.17k
                f++;
2694
5.17k
            }
2695
4.31k
        }
2696
4.31k
        if (*f == '%') {
2697
            /* "%.3%s" => f points to "3" */
2698
0
            f--;
2699
0
        }
2700
4.31k
    }
2701
12.3k
    if (*f == '\0') {
2702
        /* bogus format "%.123" => go backward, f points to "3" */
2703
0
        f--;
2704
0
    }
2705
2706
    /* Handle %ld, %lu, %lld and %llu. */
2707
12.3k
    longflag = 0;
2708
12.3k
    longlongflag = 0;
2709
12.3k
    size_tflag = 0;
2710
12.3k
    if (*f == 'l') {
2711
0
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2712
0
            longflag = 1;
2713
0
            ++f;
2714
0
        }
2715
0
        else if (f[1] == 'l' &&
2716
0
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2717
0
            longlongflag = 1;
2718
0
            f += 2;
2719
0
        }
2720
0
    }
2721
    /* handle the size_t flag. */
2722
12.3k
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2723
84
        size_tflag = 1;
2724
84
        ++f;
2725
84
    }
2726
2727
12.3k
    if (f[1] == '\0')
2728
14
        writer->overallocate = 0;
2729
2730
12.3k
    switch (*f) {
2731
80
    case 'c':
2732
80
    {
2733
80
        int ordinal = va_arg(*vargs, int);
2734
80
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2735
36
            PyErr_SetString(PyExc_OverflowError,
2736
36
                            "character argument not in range(0x110000)");
2737
36
            return NULL;
2738
36
        }
2739
44
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2740
0
            return NULL;
2741
44
        break;
2742
44
    }
2743
2744
44
    case 'i':
2745
476
    case 'd':
2746
476
    case 'u':
2747
476
    case 'x':
2748
476
    {
2749
        /* used by sprintf */
2750
476
        char buffer[MAX_LONG_LONG_CHARS];
2751
476
        Py_ssize_t arglen;
2752
2753
476
        if (*f == 'u') {
2754
0
            if (longflag)
2755
0
                len = sprintf(buffer, "%lu",
2756
0
                        va_arg(*vargs, unsigned long));
2757
0
            else if (longlongflag)
2758
0
                len = sprintf(buffer, "%llu",
2759
0
                        va_arg(*vargs, unsigned long long));
2760
0
            else if (size_tflag)
2761
0
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
2762
0
                        va_arg(*vargs, size_t));
2763
0
            else
2764
0
                len = sprintf(buffer, "%u",
2765
0
                        va_arg(*vargs, unsigned int));
2766
0
        }
2767
476
        else if (*f == 'x') {
2768
0
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
2769
0
        }
2770
476
        else {
2771
476
            if (longflag)
2772
0
                len = sprintf(buffer, "%li",
2773
0
                        va_arg(*vargs, long));
2774
476
            else if (longlongflag)
2775
0
                len = sprintf(buffer, "%lli",
2776
0
                        va_arg(*vargs, long long));
2777
476
            else if (size_tflag)
2778
84
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
2779
84
                        va_arg(*vargs, Py_ssize_t));
2780
392
            else
2781
392
                len = sprintf(buffer, "%i",
2782
392
                        va_arg(*vargs, int));
2783
476
        }
2784
476
        assert(len >= 0);
2785
2786
476
        if (precision < len)
2787
476
            precision = len;
2788
2789
476
        arglen = Py_MAX(precision, width);
2790
476
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2791
0
            return NULL;
2792
2793
476
        if (width > precision) {
2794
0
            Py_UCS4 fillchar;
2795
0
            fill = width - precision;
2796
0
            fillchar = zeropad?'0':' ';
2797
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2798
0
                return NULL;
2799
0
            writer->pos += fill;
2800
0
        }
2801
476
        if (precision > len) {
2802
0
            fill = precision - len;
2803
0
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2804
0
                return NULL;
2805
0
            writer->pos += fill;
2806
0
        }
2807
2808
476
        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2809
0
            return NULL;
2810
476
        break;
2811
476
    }
2812
2813
476
    case 'p':
2814
0
    {
2815
0
        char number[MAX_LONG_LONG_CHARS];
2816
2817
0
        len = sprintf(number, "%p", va_arg(*vargs, void*));
2818
0
        assert(len >= 0);
2819
2820
        /* %p is ill-defined:  ensure leading 0x. */
2821
0
        if (number[1] == 'X')
2822
0
            number[1] = 'x';
2823
0
        else if (number[1] != 'x') {
2824
0
            memmove(number + 2, number,
2825
0
                    strlen(number) + 1);
2826
0
            number[0] = '0';
2827
0
            number[1] = 'x';
2828
0
            len += 2;
2829
0
        }
2830
2831
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2832
0
            return NULL;
2833
0
        break;
2834
0
    }
2835
2836
4.91k
    case 's':
2837
4.91k
    {
2838
        /* UTF-8 */
2839
4.91k
        const char *s = va_arg(*vargs, const char*);
2840
4.91k
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2841
0
            return NULL;
2842
4.91k
        break;
2843
4.91k
    }
2844
2845
6.85k
    case 'U':
2846
6.85k
    {
2847
6.85k
        PyObject *obj = va_arg(*vargs, PyObject *);
2848
6.85k
        assert(obj && _PyUnicode_CHECK(obj));
2849
2850
6.85k
        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2851
0
            return NULL;
2852
6.85k
        break;
2853
6.85k
    }
2854
2855
6.85k
    case 'V':
2856
0
    {
2857
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2858
0
        const char *str = va_arg(*vargs, const char *);
2859
0
        if (obj) {
2860
0
            assert(_PyUnicode_CHECK(obj));
2861
0
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2862
0
                return NULL;
2863
0
        }
2864
0
        else {
2865
0
            assert(str != NULL);
2866
0
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2867
0
                return NULL;
2868
0
        }
2869
0
        break;
2870
0
    }
2871
2872
0
    case 'S':
2873
0
    {
2874
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2875
0
        PyObject *str;
2876
0
        assert(obj);
2877
0
        str = PyObject_Str(obj);
2878
0
        if (!str)
2879
0
            return NULL;
2880
0
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2881
0
            Py_DECREF(str);
2882
0
            return NULL;
2883
0
        }
2884
0
        Py_DECREF(str);
2885
0
        break;
2886
0
    }
2887
2888
28
    case 'R':
2889
28
    {
2890
28
        PyObject *obj = va_arg(*vargs, PyObject *);
2891
28
        PyObject *repr;
2892
28
        assert(obj);
2893
28
        repr = PyObject_Repr(obj);
2894
28
        if (!repr)
2895
0
            return NULL;
2896
28
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2897
0
            Py_DECREF(repr);
2898
0
            return NULL;
2899
0
        }
2900
28
        Py_DECREF(repr);
2901
28
        break;
2902
28
    }
2903
2904
0
    case 'A':
2905
0
    {
2906
0
        PyObject *obj = va_arg(*vargs, PyObject *);
2907
0
        PyObject *ascii;
2908
0
        assert(obj);
2909
0
        ascii = PyObject_ASCII(obj);
2910
0
        if (!ascii)
2911
0
            return NULL;
2912
0
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2913
0
            Py_DECREF(ascii);
2914
0
            return NULL;
2915
0
        }
2916
0
        Py_DECREF(ascii);
2917
0
        break;
2918
0
    }
2919
2920
0
    case '%':
2921
0
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2922
0
            return NULL;
2923
0
        break;
2924
2925
0
    default:
2926
        /* if we stumble upon an unknown formatting code, copy the rest
2927
           of the format string to the output string. (we cannot just
2928
           skip the code, since there's no way to know what's in the
2929
           argument list) */
2930
0
        len = strlen(p);
2931
0
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2932
0
            return NULL;
2933
0
        f = p+len;
2934
0
        return f;
2935
12.3k
    }
2936
2937
12.3k
    f++;
2938
12.3k
    return f;
2939
12.3k
}
2940
2941
PyObject *
2942
PyUnicode_FromFormatV(const char *format, va_list vargs)
2943
6.10k
{
2944
6.10k
    va_list vargs2;
2945
6.10k
    const char *f;
2946
6.10k
    _PyUnicodeWriter writer;
2947
2948
6.10k
    _PyUnicodeWriter_Init(&writer);
2949
6.10k
    writer.min_length = strlen(format) + 100;
2950
6.10k
    writer.overallocate = 1;
2951
2952
    // Copy varags to be able to pass a reference to a subfunction.
2953
6.10k
    va_copy(vargs2, vargs);
2954
2955
36.3k
    for (f = format; *f; ) {
2956
30.2k
        if (*f == '%') {
2957
12.3k
            f = unicode_fromformat_arg(&writer, f, &vargs2);
2958
12.3k
            if (f == NULL)
2959
36
                goto fail;
2960
12.3k
        }
2961
17.8k
        else {
2962
17.8k
            const char *p;
2963
17.8k
            Py_ssize_t len;
2964
2965
17.8k
            p = f;
2966
17.8k
            do
2967
180k
            {
2968
180k
                if ((unsigned char)*p > 127) {
2969
0
                    PyErr_Format(PyExc_ValueError,
2970
0
                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2971
0
                        "string, got a non-ASCII byte: 0x%02x",
2972
0
                        (unsigned char)*p);
2973
0
                    goto fail;
2974
0
                }
2975
180k
                p++;
2976
180k
            }
2977
180k
            while (*p != '\0' && *p != '%');
2978
17.8k
            len = p - f;
2979
2980
17.8k
            if (*p == '\0')
2981
6.05k
                writer.overallocate = 0;
2982
2983
17.8k
            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2984
0
                goto fail;
2985
2986
17.8k
            f = p;
2987
17.8k
        }
2988
30.2k
    }
2989
6.07k
    va_end(vargs2);
2990
6.07k
    return _PyUnicodeWriter_Finish(&writer);
2991
2992
36
  fail:
2993
36
    va_end(vargs2);
2994
36
    _PyUnicodeWriter_Dealloc(&writer);
2995
36
    return NULL;
2996
6.10k
}
2997
2998
PyObject *
2999
PyUnicode_FromFormat(const char *format, ...)
3000
28
{
3001
28
    PyObject* ret;
3002
28
    va_list vargs;
3003
3004
28
#ifdef HAVE_STDARG_PROTOTYPES
3005
28
    va_start(vargs, format);
3006
#else
3007
    va_start(vargs);
3008
#endif
3009
28
    ret = PyUnicode_FromFormatV(format, vargs);
3010
28
    va_end(vargs);
3011
28
    return ret;
3012
28
}
3013
3014
static Py_ssize_t
3015
unicode_get_widechar_size(PyObject *unicode)
3016
378
{
3017
378
    Py_ssize_t res;
3018
3019
378
    assert(unicode != NULL);
3020
378
    assert(_PyUnicode_CHECK(unicode));
3021
3022
378
    if (_PyUnicode_WSTR(unicode) != NULL) {
3023
0
        return PyUnicode_WSTR_LENGTH(unicode);
3024
0
    }
3025
378
    assert(PyUnicode_IS_READY(unicode));
3026
3027
378
    res = _PyUnicode_LENGTH(unicode);
3028
#if SIZEOF_WCHAR_T == 2
3029
    if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3030
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3031
        const Py_UCS4 *end = s + res;
3032
        for (; s < end; ++s) {
3033
            if (*s > 0xFFFF) {
3034
                ++res;
3035
            }
3036
        }
3037
    }
3038
#endif
3039
378
    return res;
3040
378
}
3041
3042
static void
3043
unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3044
378
{
3045
378
    const wchar_t *wstr;
3046
3047
378
    assert(unicode != NULL);
3048
378
    assert(_PyUnicode_CHECK(unicode));
3049
3050
378
    wstr = _PyUnicode_WSTR(unicode);
3051
378
    if (wstr != NULL) {
3052
0
        memcpy(w, wstr, size * sizeof(wchar_t));
3053
0
        return;
3054
0
    }
3055
378
    assert(PyUnicode_IS_READY(unicode));
3056
3057
378
    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3058
378
        const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3059
28.3k
        for (; size--; ++s, ++w) {
3060
27.9k
            *w = *s;
3061
27.9k
        }
3062
378
    }
3063
0
    else {
3064
0
#if SIZEOF_WCHAR_T == 4
3065
0
        assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3066
0
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3067
0
        for (; size--; ++s, ++w) {
3068
0
            *w = *s;
3069
0
        }
3070
#else
3071
        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3072
        const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3073
        for (; size--; ++s, ++w) {
3074
            Py_UCS4 ch = *s;
3075
            if (ch > 0xFFFF) {
3076
                assert(ch <= MAX_UNICODE);
3077
                /* encode surrogate pair in this case */
3078
                *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3079
                if (!size--)
3080
                    break;
3081
                *w = Py_UNICODE_LOW_SURROGATE(ch);
3082
            }
3083
            else {
3084
                *w = ch;
3085
            }
3086
        }
3087
#endif
3088
0
    }
3089
378
}
3090
3091
#ifdef HAVE_WCHAR_H
3092
3093
/* Convert a Unicode object to a wide character string.
3094
3095
   - If w is NULL: return the number of wide characters (including the null
3096
     character) required to convert the unicode object. Ignore size argument.
3097
3098
   - Otherwise: return the number of wide characters (excluding the null
3099
     character) written into w. Write at most size wide characters (including
3100
     the null character). */
3101
Py_ssize_t
3102
PyUnicode_AsWideChar(PyObject *unicode,
3103
                     wchar_t *w,
3104
                     Py_ssize_t size)
3105
0
{
3106
0
    Py_ssize_t res;
3107
3108
0
    if (unicode == NULL) {
3109
0
        PyErr_BadInternalCall();
3110
0
        return -1;
3111
0
    }
3112
0
    if (!PyUnicode_Check(unicode)) {
3113
0
        PyErr_BadArgument();
3114
0
        return -1;
3115
0
    }
3116
3117
0
    res = unicode_get_widechar_size(unicode);
3118
0
    if (w == NULL) {
3119
0
        return res + 1;
3120
0
    }
3121
3122
0
    if (size > res) {
3123
0
        size = res + 1;
3124
0
    }
3125
0
    else {
3126
0
        res = size;
3127
0
    }
3128
0
    unicode_copy_as_widechar(unicode, w, size);
3129
0
    return res;
3130
0
}
3131
3132
wchar_t*
3133
PyUnicode_AsWideCharString(PyObject *unicode,
3134
                           Py_ssize_t *size)
3135
378
{
3136
378
    wchar_t *buffer;
3137
378
    Py_ssize_t buflen;
3138
3139
378
    if (unicode == NULL) {
3140
0
        PyErr_BadInternalCall();
3141
0
        return NULL;
3142
0
    }
3143
378
    if (!PyUnicode_Check(unicode)) {
3144
0
        PyErr_BadArgument();
3145
0
        return NULL;
3146
0
    }
3147
3148
378
    buflen = unicode_get_widechar_size(unicode);
3149
378
    buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
3150
378
    if (buffer == NULL) {
3151
0
        PyErr_NoMemory();
3152
0
        return NULL;
3153
0
    }
3154
378
    unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3155
378
    if (size != NULL) {
3156
350
        *size = buflen;
3157
350
    }
3158
28
    else if (wcslen(buffer) != (size_t)buflen) {
3159
0
        PyMem_FREE(buffer);
3160
0
        PyErr_SetString(PyExc_ValueError,
3161
0
                        "embedded null character");
3162
0
        return NULL;
3163
0
    }
3164
378
    return buffer;
3165
378
}
3166
3167
#endif /* HAVE_WCHAR_H */
3168
3169
PyObject *
3170
PyUnicode_FromOrdinal(int ordinal)
3171
417
{
3172
417
    if (ordinal < 0 || ordinal > MAX_UNICODE) {
3173
0
        PyErr_SetString(PyExc_ValueError,
3174
0
                        "chr() arg not in range(0x110000)");
3175
0
        return NULL;
3176
0
    }
3177
3178
417
    return unicode_char((Py_UCS4)ordinal);
3179
417
}
3180
3181
/* Return `obj` as an exact str object.

   - An exact str is made ready and returned with a new reference.
   - An instance of a str subtype yields the result of _PyUnicode_Copy().
   - Anything else raises TypeError.

   Returns NULL on failure. */
PyObject *
PyUnicode_FromObject(PyObject *obj)
{
    /* XXX Perhaps we should make this API an alias of
       PyObject_Str() instead ?! */
    if (!PyUnicode_Check(obj)) {
        PyErr_Format(PyExc_TypeError,
                     "Can't convert '%.100s' object to str implicitly",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (!PyUnicode_CheckExact(obj)) {
        /* For a Unicode subtype that's not a Unicode object,
           return a true Unicode object with the same data. */
        return _PyUnicode_Copy(obj);
    }

    if (PyUnicode_READY(obj) == -1) {
        return NULL;
    }
    Py_INCREF(obj);
    return obj;
}
3202
3203
/* Decode the bytes-like object `obj` to a str object using `encoding` and
   `errors`, both forwarded to PyUnicode_Decode().

   Empty input yields the empty string.  A NULL `obj` is a bad internal
   call; a str `obj`, or one without buffer support, raises TypeError.
   Returns NULL on failure. */
PyObject *
PyUnicode_FromEncodedObject(PyObject *obj,
                            const char *encoding,
                            const char *errors)
{
    Py_buffer view;
    PyObject *decoded;

    if (obj == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }

    /* Decoding bytes objects is the most common case and should be fast */
    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) == 0) {
            _Py_RETURN_UNICODE_EMPTY();
        }
        return PyUnicode_Decode(PyBytes_AS_STRING(obj),
                                PyBytes_GET_SIZE(obj),
                                encoding, errors);
    }

    if (PyUnicode_Check(obj)) {
        PyErr_SetString(PyExc_TypeError,
                        "decoding str is not supported");
        return NULL;
    }

    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
    if (PyObject_GetBuffer(obj, &view, PyBUF_SIMPLE) < 0) {
        PyErr_Format(PyExc_TypeError,
                     "decoding to str: need a bytes-like object, %.80s found",
                     Py_TYPE(obj)->tp_name);
        return NULL;
    }

    if (view.len == 0) {
        PyBuffer_Release(&view);
        _Py_RETURN_UNICODE_EMPTY();
    }

    decoded = PyUnicode_Decode((char*) view.buf, view.len, encoding, errors);
    PyBuffer_Release(&view);
    return decoded;
}
3249
3250
/* Normalize an encoding name into the buffer `lower` of `lower_len` bytes:
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters becomes a single '_', except at the very start (where
   it is dropped) and at the very end (where nothing follows to emit it).

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *limit = &lower[lower_len - 1];   /* slot reserved for the null */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is emitted lazily. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3298
3299
PyObject *
3300
PyUnicode_Decode(const char *s,
3301
                 Py_ssize_t size,
3302
                 const char *encoding,
3303
                 const char *errors)
3304
34
{
3305
34
    PyObject *buffer = NULL, *unicode;
3306
34
    Py_buffer info;
3307
34
    char buflower[11];   /* strlen("iso-8859-1\0") == 11, longest shortcut */
3308
3309
34
    if (encoding == NULL) {
3310
0
        return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3311
0
    }
3312
3313
    /* Shortcuts for common default encodings */
3314
34
    if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3315
34
        char *lower = buflower;
3316
3317
        /* Fast paths */
3318
34
        if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3319
2
            lower += 3;
3320
2
            if (*lower == '_') {
3321
                /* Match "utf8" and "utf_8" */
3322
2
                lower++;
3323
2
            }
3324
3325
2
            if (lower[0] == '8' && lower[1] == 0) {
3326
2
                return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3327
2
            }
3328
0
            else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3329
0
                return PyUnicode_DecodeUTF16(s, size, errors, 0);
3330
0
            }
3331
0
            else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3332
0
                return PyUnicode_DecodeUTF32(s, size, errors, 0);
3333
0
            }
3334
2
        }
3335
32
        else {
3336
32
            if (strcmp(lower, "ascii") == 0
3337
32
                || strcmp(lower, "us_ascii") == 0) {
3338
31
                return PyUnicode_DecodeASCII(s, size, errors);
3339
31
            }
3340
    #ifdef MS_WINDOWS
3341
            else if (strcmp(lower, "mbcs") == 0) {
3342
                return PyUnicode_DecodeMBCS(s, size, errors);
3343
            }
3344
    #endif
3345
1
            else if (strcmp(lower, "latin1") == 0
3346
1
                     || strcmp(lower, "latin_1") == 0
3347
1
                     || strcmp(lower, "iso_8859_1") == 0
3348
1
                     || strcmp(lower, "iso8859_1") == 0) {
3349
1
                return PyUnicode_DecodeLatin1(s, size, errors);
3350
1
            }
3351
32
        }
3352
34
    }
3353
3354
    /* Decode via the codec registry */
3355
0
    buffer = NULL;
3356
0
    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3357
0
        goto onError;
3358
0
    buffer = PyMemoryView_FromBuffer(&info);
3359
0
    if (buffer == NULL)
3360
0
        goto onError;
3361
0
    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3362
0
    if (unicode == NULL)
3363
0
        goto onError;
3364
0
    if (!PyUnicode_Check(unicode)) {
3365
0
        PyErr_Format(PyExc_TypeError,
3366
0
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3367
0
                     "use codecs.decode() to decode to arbitrary types",
3368
0
                     encoding,
3369
0
                     Py_TYPE(unicode)->tp_name);
3370
0
        Py_DECREF(unicode);
3371
0
        goto onError;
3372
0
    }
3373
0
    Py_DECREF(buffer);
3374
0
    return unicode_result(unicode);
3375
3376
0
  onError:
3377
0
    Py_XDECREF(buffer);
3378
0
    return NULL;
3379
0
}
3380
3381
/* Deprecated: decode the str object `unicode` through the codec registry
   and return whatever object the codec produces.

   A non-str argument is reported with PyErr_BadArgument().  A
   DeprecationWarning is issued before decoding.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding().  Returns NULL on failure. */
PyObject *
PyUnicode_AsDecodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec_name;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "PyUnicode_AsDecodedObject() is deprecated; "
                     "use PyCodec_Decode() to decode from str", 1) < 0) {
        return NULL;
    }

    codec_name = (encoding != NULL) ? encoding
                                    : PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    return PyCodec_Decode(unicode, codec_name, errors);
}
3402
3403
/* Deprecated: decode the str object `unicode` through the codec registry
   and require the result to be a str object.

   A non-str argument is reported with PyErr_BadArgument().  A
   DeprecationWarning is issued before decoding.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding().  A decoder that returns something other
   than a str raises TypeError naming the returned type.

   Returns NULL on failure. */
PyObject *
PyUnicode_AsDecodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    PyObject *v;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "PyUnicode_AsDecodedUnicode() is deprecated; "
                     "use PyCodec_Decode() to decode from str to str", 1) < 0)
        return NULL;

    if (encoding == NULL)
        encoding = PyUnicode_GetDefaultEncoding();

    /* Decode via the codec registry */
    v = PyCodec_Decode(unicode, encoding, errors);
    if (v == NULL)
        return NULL;

    if (!PyUnicode_Check(v)) {
        /* Report the type of the decoder's result `v`, not of the input
           `unicode` (which is always a str here). */
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
                     "use codecs.decode() to decode to arbitrary types",
                     encoding,
                     Py_TYPE(v)->tp_name);
        Py_DECREF(v);
        return NULL;
    }
    return unicode_result(v);
}
3441
3442
PyObject *
3443
PyUnicode_Encode(const Py_UNICODE *s,
3444
                 Py_ssize_t size,
3445
                 const char *encoding,
3446
                 const char *errors)
3447
0
{
3448
0
    PyObject *v, *unicode;
3449
3450
0
    unicode = PyUnicode_FromWideChar(s, size);
3451
0
    if (unicode == NULL)
3452
0
        return NULL;
3453
0
    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3454
0
    Py_DECREF(unicode);
3455
0
    return v;
3456
0
}
3457
3458
/* Deprecated: encode the str object `unicode` through the codec registry
   and return whatever object the codec produces.

   A non-str argument is reported with PyErr_BadArgument().  A
   DeprecationWarning is issued before encoding.  A NULL `encoding` selects
   PyUnicode_GetDefaultEncoding().  Returns NULL on failure. */
PyObject *
PyUnicode_AsEncodedObject(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    const char *codec_name;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyErr_WarnEx(PyExc_DeprecationWarning,
                     "PyUnicode_AsEncodedObject() is deprecated; "
                     "use PyUnicode_AsEncodedString() to encode from str to bytes "
                     "or PyCodec_Encode() for generic encoding", 1) < 0) {
        return NULL;
    }

    codec_name = (encoding != NULL) ? encoding
                                    : PyUnicode_GetDefaultEncoding();

    /* Encode via the codec registry; NULL is passed through on failure. */
    return PyCodec_Encode(unicode, codec_name, errors);
}
3488
3489
3490
static PyObject *
3491
unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
3492
                      int current_locale)
3493
350
{
3494
350
    Py_ssize_t wlen;
3495
350
    wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3496
350
    if (wstr == NULL) {
3497
0
        return NULL;
3498
0
    }
3499
3500
350
    if ((size_t)wlen != wcslen(wstr)) {
3501
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3502
0
        PyMem_Free(wstr);
3503
0
        return NULL;
3504
0
    }
3505
3506
350
    char *str;
3507
350
    size_t error_pos;
3508
350
    const char *reason;
3509
350
    int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
3510
350
                                 current_locale, error_handler);
3511
350
    PyMem_Free(wstr);
3512
3513
350
    if (res != 0) {
3514
0
        if (res == -2) {
3515
0
            PyObject *exc;
3516
0
            exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3517
0
                    "locale", unicode,
3518
0
                    (Py_ssize_t)error_pos,
3519
0
                    (Py_ssize_t)(error_pos+1),
3520
0
                    reason);
3521
0
            if (exc != NULL) {
3522
0
                PyCodec_StrictErrors(exc);
3523
0
                Py_DECREF(exc);
3524
0
            }
3525
0
        }
3526
0
        else if (res == -3) {
3527
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3528
0
        }
3529
0
        else {
3530
0
            PyErr_NoMemory();
3531
0
        }
3532
0
        return NULL;
3533
0
    }
3534
3535
350
    PyObject *bytes = PyBytes_FromString(str);
3536
350
    PyMem_RawFree(str);
3537
350
    return bytes;
3538
350
}
3539
3540
/* Encode `unicode` with unicode_encode_locale(), passing 1 for its
   current_locale argument.  `errors` is translated to a
   _Py_error_handler value by _Py_GetErrorHandler(). */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    _Py_error_handler handler = _Py_GetErrorHandler(errors);

    return unicode_encode_locale(unicode, handler, 1);
}
3546
3547
/* Encode `unicode` with the filesystem encoding of the current
   interpreter.

   When interp->fs_codec.encoding is set, that codec (and its error
   settings) is used.  Otherwise the error handler is derived from
   interp->config.filesystem_errors and a C-level fallback encoder is
   used: UTF-8 when _Py_FORCE_UTF8_FS_ENCODING is defined, the locale
   encoder otherwise. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
#ifdef _Py_FORCE_UTF8_FS_ENCODING
    if (interp->fs_codec.encoding) {
        return unicode_encode_utf8(unicode,
                                   interp->fs_codec.error_handler,
                                   interp->fs_codec.errors);
    }

    _Py_error_handler handler =
        get_error_handler_wide(interp->config.filesystem_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
    return unicode_encode_utf8(unicode, handler, NULL);
#else
    /* Bootstrap check: a filesystem codec implemented in Python cannot be
       used to encode or decode filenames before it has been loaded, and
       loading it requires encoding at least its own filename.  Until the
       codec registry is initialized and the Python codec is loaded, fall
       back to the C implementation of the locale codec.
       See initfsencoding(). */
    if (interp->fs_codec.encoding) {
        return PyUnicode_AsEncodedString(unicode,
                                         interp->fs_codec.encoding,
                                         interp->fs_codec.errors);
    }

    _Py_error_handler handler =
        get_error_handler_wide(interp->config.filesystem_errors);
    assert(handler != _Py_ERROR_UNKNOWN);
    return unicode_encode_locale(unicode, handler, 0);
#endif
}
3584
3585
/* Encode `unicode` to a bytes object using `encoding` and `errors`.

   A NULL `encoding` means UTF-8.  A handful of common encoding names are
   dispatched directly to their C encoders; everything else goes through
   _PyCodec_EncodeText().  A codec that returns a bytearray triggers a
   RuntimeWarning and its result is copied into a bytes object; any other
   non-bytes result raises TypeError.

   Returns a new reference, or NULL on failure. */
PyObject *
PyUnicode_AsEncodedString(PyObject *unicode,
                          const char *encoding,
                          const char *errors)
{
    /* Room for "iso_8859_1" plus its terminating NUL: the longest
       encoding name that has a shortcut below. */
    char normalized[11];

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    if (encoding == NULL) {
        return _PyUnicode_AsUTF8String(unicode, errors);
    }

    /* Shortcuts for common default encodings */
    if (_Py_normalize_encoding(encoding, normalized, sizeof(normalized))) {
        const char *name = normalized;

        if (strncmp(name, "utf", 3) == 0) {
            /* Accept both "utf8" and "utf_8" (likewise for 16 and 32). */
            name += 3;
            if (*name == '_') {
                name++;
            }

            if (strcmp(name, "8") == 0) {
                return _PyUnicode_AsUTF8String(unicode, errors);
            }
            if (strcmp(name, "16") == 0) {
                return _PyUnicode_EncodeUTF16(unicode, errors, 0);
            }
            if (strcmp(name, "32") == 0) {
                return _PyUnicode_EncodeUTF32(unicode, errors, 0);
            }
            /* Any other "utf..." name falls through to the registry. */
        }
        else {
            if (strcmp(name, "ascii") == 0
                || strcmp(name, "us_ascii") == 0) {
                return _PyUnicode_AsASCIIString(unicode, errors);
            }
#ifdef MS_WINDOWS
            if (strcmp(name, "mbcs") == 0) {
                return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
            }
#endif
            if (strcmp(name, "latin1") == 0 ||
                strcmp(name, "latin_1") == 0 ||
                strcmp(name, "iso_8859_1") == 0 ||
                strcmp(name, "iso8859_1") == 0) {
                return _PyUnicode_AsLatin1String(unicode, errors);
            }
        }
    }

    /* Encode via the codec registry */
    PyObject *raw = _PyCodec_EncodeText(unicode, encoding, errors);
    if (raw == NULL) {
        return NULL;
    }

    /* The normal path */
    if (PyBytes_Check(raw)) {
        return raw;
    }

    PyObject *result = NULL;
    if (PyByteArray_Check(raw)) {
        /* Tolerated with a warning: convert the bytearray to bytes.
           If the warning is turned into an error, result stays NULL. */
        int warn_failed = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
            "encoder %s returned bytearray instead of bytes; "
            "use codecs.encode() to encode to arbitrary types",
            encoding);
        if (!warn_failed) {
            result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(raw),
                                               PyByteArray_GET_SIZE(raw));
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
                     "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
                     "use codecs.encode() to encode to arbitrary types",
                     encoding,
                     Py_TYPE(raw)->tp_name);
    }

    /* The codec's own result is dropped on every remaining path. */
    Py_DECREF(raw);
    return result;
}
3680
3681
/* Deprecated: encode `unicode` through the codec registry and require
   the codec to return a str object.

   Returns NULL if `unicode` is not a str, if issuing the
   DeprecationWarning fails, if PyCodec_Encode() fails, or (with a
   TypeError) if the codec returned something other than str.  A NULL
   `encoding` selects PyUnicode_GetDefaultEncoding(). */
PyObject *
PyUnicode_AsEncodedUnicode(PyObject *unicode,
                           const char *encoding,
                           const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }

    int warn_status = PyErr_WarnEx(
        PyExc_DeprecationWarning,
        "PyUnicode_AsEncodedUnicode() is deprecated; "
        "use PyCodec_Encode() to encode from str to str", 1);
    if (warn_status < 0) {
        return NULL;
    }

    const char *codec_name = encoding;
    if (codec_name == NULL) {
        codec_name = PyUnicode_GetDefaultEncoding();
    }

    /* Encode via the codec registry */
    PyObject *encoded = PyCodec_Encode(unicode, codec_name, errors);
    if (encoded == NULL) {
        return NULL;
    }
    if (PyUnicode_Check(encoded)) {
        return encoded;
    }

    PyErr_Format(PyExc_TypeError,
                 "'%.400s' encoder returned '%.400s' instead of 'str'; "
                 "use codecs.encode() to encode to arbitrary types",
                 codec_name,
                 Py_TYPE(encoded)->tp_name);
    Py_DECREF(encoded);
    return NULL;
}
3719
3720
static PyObject*
3721
unicode_decode_locale(const char *str, Py_ssize_t len,
3722
                      _Py_error_handler errors, int current_locale)
3723
4.86k
{
3724
4.86k
    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3725
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3726
0
        return NULL;
3727
0
    }
3728
3729
4.86k
    wchar_t *wstr;
3730
4.86k
    size_t wlen;
3731
4.86k
    const char *reason;
3732
4.86k
    int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
3733
4.86k
                                 current_locale, errors);
3734
4.86k
    if (res != 0) {
3735
0
        if (res == -2) {
3736
0
            PyObject *exc;
3737
0
            exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3738
0
                                        "locale", str, len,
3739
0
                                        (Py_ssize_t)wlen,
3740
0
                                        (Py_ssize_t)(wlen + 1),
3741
0
                                        reason);
3742
0
            if (exc != NULL) {
3743
0
                PyCodec_StrictErrors(exc);
3744
0
                Py_DECREF(exc);
3745
0
            }
3746
0
        }
3747
0
        else if (res == -3) {
3748
0
            PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3749
0
        }
3750
0
        else {
3751
0
            PyErr_NoMemory();
3752
0
        }
3753
0
        return NULL;
3754
0
    }
3755
3756
4.86k
    PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3757
4.86k
    PyMem_RawFree(wstr);
3758
4.86k
    return unicode;
3759
4.86k
}
3760
3761
PyObject*
3762
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3763
                              const char *errors)
3764
0
{
3765
0
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3766
0
    return unicode_decode_locale(str, len, error_handler, 1);
3767
0
}
3768
3769
PyObject*
3770
PyUnicode_DecodeLocale(const char *str, const char *errors)
3771
230
{
3772
230
    Py_ssize_t size = (Py_ssize_t)strlen(str);
3773
230
    _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3774
230
    return unicode_decode_locale(str, size, error_handler, 1);
3775
230
}
3776
3777
3778
PyObject*
3779
14
PyUnicode_DecodeFSDefault(const char *s) {
3780
14
    Py_ssize_t size = (Py_ssize_t)strlen(s);
3781
14
    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3782
14
}
3783
3784
PyObject*
3785
PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3786
4.65k
{
3787
4.65k
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
3788
#ifdef _Py_FORCE_UTF8_FS_ENCODING
3789
    if (interp->fs_codec.encoding) {
3790
        return unicode_decode_utf8(s, size,
3791
                                   interp->fs_codec.error_handler,
3792
                                   interp->fs_codec.errors,
3793
                                   NULL);
3794
    }
3795
    else {
3796
        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3797
        _Py_error_handler errors;
3798
        errors = get_error_handler_wide(filesystem_errors);
3799
        assert(errors != _Py_ERROR_UNKNOWN);
3800
        return unicode_decode_utf8(s, size, errors, NULL, NULL);
3801
    }
3802
#else
3803
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3804
       cannot use it to encode and decode filenames before it is loaded. Load
3805
       the Python codec requires to encode at least its own filename. Use the C
3806
       implementation of the locale codec until the codec registry is
3807
       initialized and the Python codec is loaded. See initfsencoding(). */
3808
4.65k
    if (interp->fs_codec.encoding) {
3809
17
        return PyUnicode_Decode(s, size,
3810
17
                                interp->fs_codec.encoding,
3811
17
                                interp->fs_codec.errors);
3812
17
    }
3813
4.63k
    else {
3814
4.63k
        const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3815
4.63k
        _Py_error_handler errors;
3816
4.63k
        errors = get_error_handler_wide(filesystem_errors);
3817
4.63k
        return unicode_decode_locale(s, size, errors, 0);
3818
4.63k
    }
3819
4.65k
#endif
3820
4.65k
}
3821
3822
3823
int
3824
PyUnicode_FSConverter(PyObject* arg, void* addr)
3825
1.34k
{
3826
1.34k
    PyObject *path = NULL;
3827
1.34k
    PyObject *output = NULL;
3828
1.34k
    Py_ssize_t size;
3829
1.34k
    void *data;
3830
1.34k
    if (arg == NULL) {
3831
0
        Py_DECREF(*(PyObject**)addr);
3832
0
        *(PyObject**)addr = NULL;
3833
0
        return 1;
3834
0
    }
3835
1.34k
    path = PyOS_FSPath(arg);
3836
1.34k
    if (path == NULL) {
3837
0
        return 0;
3838
0
    }
3839
1.34k
    if (PyBytes_Check(path)) {
3840
0
        output = path;
3841
0
    }
3842
1.34k
    else {  // PyOS_FSPath() guarantees its returned value is bytes or str.
3843
1.34k
        output = PyUnicode_EncodeFSDefault(path);
3844
1.34k
        Py_DECREF(path);
3845
1.34k
        if (!output) {
3846
0
            return 0;
3847
0
        }
3848
1.34k
        assert(PyBytes_Check(output));
3849
1.34k
    }
3850
3851
1.34k
    size = PyBytes_GET_SIZE(output);
3852
1.34k
    data = PyBytes_AS_STRING(output);
3853
1.34k
    if ((size_t)size != strlen(data)) {
3854
0
        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3855
0
        Py_DECREF(output);
3856
0
        return 0;
3857
0
    }
3858
1.34k
    *(PyObject**)addr = output;
3859
1.34k
    return Py_CLEANUP_SUPPORTED;
3860
1.34k
}
3861
3862
3863
int
3864
PyUnicode_FSDecoder(PyObject* arg, void* addr)
3865
0
{
3866
0
    int is_buffer = 0;
3867
0
    PyObject *path = NULL;
3868
0
    PyObject *output = NULL;
3869
0
    if (arg == NULL) {
3870
0
        Py_DECREF(*(PyObject**)addr);
3871
0
        *(PyObject**)addr = NULL;
3872
0
        return 1;
3873
0
    }
3874
3875
0
    is_buffer = PyObject_CheckBuffer(arg);
3876
0
    if (!is_buffer) {
3877
0
        path = PyOS_FSPath(arg);
3878
0
        if (path == NULL) {
3879
0
            return 0;
3880
0
        }
3881
0
    }
3882
0
    else {
3883
0
        path = arg;
3884
0
        Py_INCREF(arg);
3885
0
    }
3886
3887
0
    if (PyUnicode_Check(path)) {
3888
0
        output = path;
3889
0
    }
3890
0
    else if (PyBytes_Check(path) || is_buffer) {
3891
0
        PyObject *path_bytes = NULL;
3892
3893
0
        if (!PyBytes_Check(path) &&
3894
0
            PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
3895
0
            "path should be string, bytes, or os.PathLike, not %.200s",
3896
0
            Py_TYPE(arg)->tp_name)) {
3897
0
                Py_DECREF(path);
3898
0
            return 0;
3899
0
        }
3900
0
        path_bytes = PyBytes_FromObject(path);
3901
0
        Py_DECREF(path);
3902
0
        if (!path_bytes) {
3903
0
            return 0;
3904
0
        }
3905
0
        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3906
0
                                                  PyBytes_GET_SIZE(path_bytes));
3907
0
        Py_DECREF(path_bytes);
3908
0
        if (!output) {
3909
0
            return 0;
3910
0
        }
3911
0
    }
3912
0
    else {
3913
0
        PyErr_Format(PyExc_TypeError,
3914
0
                     "path should be string, bytes, or os.PathLike, not %.200s",
3915
0
                     Py_TYPE(arg)->tp_name);
3916
0
        Py_DECREF(path);
3917
0
        return 0;
3918
0
    }
3919
0
    if (PyUnicode_READY(output) == -1) {
3920
0
        Py_DECREF(output);
3921
0
        return 0;
3922
0
    }
3923
0
    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3924
0
                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3925
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
3926
0
        Py_DECREF(output);
3927
0
        return 0;
3928
0
    }
3929
0
    *(PyObject**)addr = output;
3930
0
    return Py_CLEANUP_SUPPORTED;
3931
0
}
3932
3933
3934
const char *
3935
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3936
3.43k
{
3937
3.43k
    PyObject *bytes;
3938
3939
3.43k
    if (!PyUnicode_Check(unicode)) {
3940
0
        PyErr_BadArgument();
3941
0
        return NULL;
3942
0
    }
3943
3.43k
    if (PyUnicode_READY(unicode) == -1)
3944
0
        return NULL;
3945
3946
3.43k
    if (PyUnicode_UTF8(unicode) == NULL) {
3947
0
        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3948
0
        bytes = _PyUnicode_AsUTF8String(unicode, NULL);
3949
0
        if (bytes == NULL)
3950
0
            return NULL;
3951
0
        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3952
0
        if (_PyUnicode_UTF8(unicode) == NULL) {
3953
0
            PyErr_NoMemory();
3954
0
            Py_DECREF(bytes);
3955
0
            return NULL;
3956
0
        }
3957
0
        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3958
0
        memcpy(_PyUnicode_UTF8(unicode),
3959
0
                  PyBytes_AS_STRING(bytes),
3960
0
                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3961
0
        Py_DECREF(bytes);
3962
0
    }
3963
3964
3.43k
    if (psize)
3965
2.13k
        *psize = PyUnicode_UTF8_LENGTH(unicode);
3966
3.43k
    return PyUnicode_UTF8(unicode);
3967
3.43k
}
3968
3969
const char *
3970
PyUnicode_AsUTF8(PyObject *unicode)
3971
1.29k
{
3972
1.29k
    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3973
1.29k
}
3974
3975
Py_UNICODE *
3976
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3977
0
{
3978
0
    if (!PyUnicode_Check(unicode)) {
3979
0
        PyErr_BadArgument();
3980
0
        return NULL;
3981
0
    }
3982
0
    Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3983
0
    if (w == NULL) {
3984
        /* Non-ASCII compact unicode object */
3985
0
        assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
3986
0
        assert(PyUnicode_IS_READY(unicode));
3987
3988
0
        Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3989
0
        if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3990
0
            PyErr_NoMemory();
3991
0
            return NULL;
3992
0
        }
3993
0
        w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3994
0
        if (w == NULL) {
3995
0
            PyErr_NoMemory();
3996
0
            return NULL;
3997
0
        }
3998
0
        unicode_copy_as_widechar(unicode, w, wlen + 1);
3999
0
        _PyUnicode_WSTR(unicode) = w;
4000
0
        if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4001
0
            _PyUnicode_WSTR_LENGTH(unicode) = wlen;
4002
0
        }
4003
0
    }
4004
0
    if (size != NULL)
4005
0
        *size = PyUnicode_WSTR_LENGTH(unicode);
4006
0
    return w;
4007
0
}
4008
4009
Py_UNICODE *
4010
PyUnicode_AsUnicode(PyObject *unicode)
4011
0
{
4012
0
    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
4013
0
}
4014
4015
const Py_UNICODE *
4016
_PyUnicode_AsUnicode(PyObject *unicode)
4017
0
{
4018
0
    Py_ssize_t size;
4019
0
    const Py_UNICODE *wstr;
4020
4021
0
    wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4022
0
    if (wstr && wcslen(wstr) != (size_t)size) {
4023
0
        PyErr_SetString(PyExc_ValueError, "embedded null character");
4024
0
        return NULL;
4025
0
    }
4026
0
    return wstr;
4027
0
}
4028
4029
4030
Py_ssize_t
4031
PyUnicode_GetSize(PyObject *unicode)
4032
0
{
4033
0
    if (!PyUnicode_Check(unicode)) {
4034
0
        PyErr_BadArgument();
4035
0
        goto onError;
4036
0
    }
4037
0
    if (_PyUnicode_WSTR(unicode) == NULL) {
4038
0
        if (PyUnicode_AsUnicode(unicode) == NULL)
4039
0
            goto onError;
4040
0
    }
4041
0
    return PyUnicode_WSTR_LENGTH(unicode);
4042
4043
0
  onError:
4044
0
    return -1;
4045
0
}
4046
4047
Py_ssize_t
4048
PyUnicode_GetLength(PyObject *unicode)
4049
0
{
4050
0
    if (!PyUnicode_Check(unicode)) {
4051
0
        PyErr_BadArgument();
4052
0
        return -1;
4053
0
    }
4054
0
    if (PyUnicode_READY(unicode) == -1)
4055
0
        return -1;
4056
0
    return PyUnicode_GET_LENGTH(unicode);
4057
0
}
4058
4059
Py_UCS4
4060
PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4061
0
{
4062
0
    void *data;
4063
0
    int kind;
4064
4065
0
    if (!PyUnicode_Check(unicode)) {
4066
0
        PyErr_BadArgument();
4067
0
        return (Py_UCS4)-1;
4068
0
    }
4069
0
    if (PyUnicode_READY(unicode) == -1) {
4070
0
        return (Py_UCS4)-1;
4071
0
    }
4072
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4073
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4074
0
        return (Py_UCS4)-1;
4075
0
    }
4076
0
    data = PyUnicode_DATA(unicode);
4077
0
    kind = PyUnicode_KIND(unicode);
4078
0
    return PyUnicode_READ(kind, data, index);
4079
0
}
4080
4081
int
4082
PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4083
0
{
4084
0
    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
4085
0
        PyErr_BadArgument();
4086
0
        return -1;
4087
0
    }
4088
0
    assert(PyUnicode_IS_READY(unicode));
4089
0
    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
4090
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
4091
0
        return -1;
4092
0
    }
4093
0
    if (unicode_check_modifiable(unicode))
4094
0
        return -1;
4095
0
    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4096
0
        PyErr_SetString(PyExc_ValueError, "character out of range");
4097
0
        return -1;
4098
0
    }
4099
0
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4100
0
                    index, ch);
4101
0
    return 0;
4102
0
}
4103
4104
/* Return the name of the default encoding, the constant string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4109
4110
/* create or adjust a UnicodeDecodeError */
4111
static void
4112
make_decode_exception(PyObject **exceptionObject,
4113
                      const char *encoding,
4114
                      const char *input, Py_ssize_t length,
4115
                      Py_ssize_t startpos, Py_ssize_t endpos,
4116
                      const char *reason)
4117
0
{
4118
0
    if (*exceptionObject == NULL) {
4119
0
        *exceptionObject = PyUnicodeDecodeError_Create(
4120
0
            encoding, input, length, startpos, endpos, reason);
4121
0
    }
4122
0
    else {
4123
0
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4124
0
            goto onError;
4125
0
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4126
0
            goto onError;
4127
0
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4128
0
            goto onError;
4129
0
    }
4130
0
    return;
4131
4132
0
onError:
4133
0
    Py_CLEAR(*exceptionObject);
4134
0
}
4135
4136
#ifdef MS_WINDOWS
4137
static int
4138
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4139
{
4140
    if (newsize > *size) {
4141
        wchar_t *newbuf = *buf;
4142
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4143
            PyErr_NoMemory();
4144
            return -1;
4145
        }
4146
        *buf = newbuf;
4147
    }
4148
    *size = newsize;
4149
    return 0;
4150
}
4151
4152
/* error handling callback helper:
4153
   build arguments, call the callback and check the arguments,
4154
   if no exception occurred, copy the replacement to the output
4155
   and adjust various state variables.
4156
   return 0 on success, -1 on error
4157
*/
4158
4159
static int
4160
unicode_decode_call_errorhandler_wchar(
4161
    const char *errors, PyObject **errorHandler,
4162
    const char *encoding, const char *reason,
4163
    const char **input, const char **inend, Py_ssize_t *startinpos,
4164
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4165
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
4166
{
4167
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4168
4169
    PyObject *restuple = NULL;
4170
    PyObject *repunicode = NULL;
4171
    Py_ssize_t outsize;
4172
    Py_ssize_t insize;
4173
    Py_ssize_t requiredsize;
4174
    Py_ssize_t newpos;
4175
    PyObject *inputobj = NULL;
4176
    wchar_t *repwstr;
4177
    Py_ssize_t repwlen;
4178
4179
    if (*errorHandler == NULL) {
4180
        *errorHandler = PyCodec_LookupError(errors);
4181
        if (*errorHandler == NULL)
4182
            goto onError;
4183
    }
4184
4185
    make_decode_exception(exceptionObject,
4186
        encoding,
4187
        *input, *inend - *input,
4188
        *startinpos, *endinpos,
4189
        reason);
4190
    if (*exceptionObject == NULL)
4191
        goto onError;
4192
4193
    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4194
    if (restuple == NULL)
4195
        goto onError;
4196
    if (!PyTuple_Check(restuple)) {
4197
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4198
        goto onError;
4199
    }
4200
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4201
        goto onError;
4202
4203
    /* Copy back the bytes variables, which might have been modified by the
4204
       callback */
4205
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4206
    if (!inputobj)
4207
        goto onError;
4208
    *input = PyBytes_AS_STRING(inputobj);
4209
    insize = PyBytes_GET_SIZE(inputobj);
4210
    *inend = *input + insize;
4211
    /* we can DECREF safely, as the exception has another reference,
4212
       so the object won't go away. */
4213
    Py_DECREF(inputobj);
4214
4215
    if (newpos<0)
4216
        newpos = insize+newpos;
4217
    if (newpos<0 || newpos>insize) {
4218
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4219
        goto onError;
4220
    }
4221
4222
    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4223
    if (repwstr == NULL)
4224
        goto onError;
4225
    /* need more space? (at least enough for what we
4226
       have+the replacement+the rest of the string (starting
4227
       at the new input position), so we won't have to check space
4228
       when there are no errors in the rest of the string) */
4229
    requiredsize = *outpos;
4230
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4231
        goto overflow;
4232
    requiredsize += repwlen;
4233
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4234
        goto overflow;
4235
    requiredsize += insize - newpos;
4236
    outsize = *bufsize;
4237
    if (requiredsize > outsize) {
4238
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4239
            requiredsize = 2*outsize;
4240
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
4241
            goto onError;
4242
        }
4243
    }
4244
    wcsncpy(*buf + *outpos, repwstr, repwlen);
4245
    *outpos += repwlen;
4246
    *endinpos = newpos;
4247
    *inptr = *input + newpos;
4248
4249
    /* we made it! */
4250
    Py_DECREF(restuple);
4251
    return 0;
4252
4253
  overflow:
4254
    PyErr_SetString(PyExc_OverflowError,
4255
                    "decoded result is too long for a Python string");
4256
4257
  onError:
4258
    Py_XDECREF(restuple);
4259
    return -1;
4260
}
4261
#endif   /* MS_WINDOWS */
4262
4263
static int
4264
unicode_decode_call_errorhandler_writer(
4265
    const char *errors, PyObject **errorHandler,
4266
    const char *encoding, const char *reason,
4267
    const char **input, const char **inend, Py_ssize_t *startinpos,
4268
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4269
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4270
0
{
4271
0
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
4272
4273
0
    PyObject *restuple = NULL;
4274
0
    PyObject *repunicode = NULL;
4275
0
    Py_ssize_t insize;
4276
0
    Py_ssize_t newpos;
4277
0
    Py_ssize_t replen;
4278
0
    Py_ssize_t remain;
4279
0
    PyObject *inputobj = NULL;
4280
0
    int need_to_grow = 0;
4281
0
    const char *new_inptr;
4282
4283
0
    if (*errorHandler == NULL) {
4284
0
        *errorHandler = PyCodec_LookupError(errors);
4285
0
        if (*errorHandler == NULL)
4286
0
            goto onError;
4287
0
    }
4288
4289
0
    make_decode_exception(exceptionObject,
4290
0
        encoding,
4291
0
        *input, *inend - *input,
4292
0
        *startinpos, *endinpos,
4293
0
        reason);
4294
0
    if (*exceptionObject == NULL)
4295
0
        goto onError;
4296
4297
0
    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4298
0
    if (restuple == NULL)
4299
0
        goto onError;
4300
0
    if (!PyTuple_Check(restuple)) {
4301
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4302
0
        goto onError;
4303
0
    }
4304
0
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
4305
0
        goto onError;
4306
4307
    /* Copy back the bytes variables, which might have been modified by the
4308
       callback */
4309
0
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4310
0
    if (!inputobj)
4311
0
        goto onError;
4312
0
    remain = *inend - *input - *endinpos;
4313
0
    *input = PyBytes_AS_STRING(inputobj);
4314
0
    insize = PyBytes_GET_SIZE(inputobj);
4315
0
    *inend = *input + insize;
4316
    /* we can DECREF safely, as the exception has another reference,
4317
       so the object won't go away. */
4318
0
    Py_DECREF(inputobj);
4319
4320
0
    if (newpos<0)
4321
0
        newpos = insize+newpos;
4322
0
    if (newpos<0 || newpos>insize) {
4323
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4324
0
        goto onError;
4325
0
    }
4326
4327
0
    replen = PyUnicode_GET_LENGTH(repunicode);
4328
0
    if (replen > 1) {
4329
0
        writer->min_length += replen - 1;
4330
0
        need_to_grow = 1;
4331
0
    }
4332
0
    new_inptr = *input + newpos;
4333
0
    if (*inend - new_inptr > remain) {
4334
        /* We don't know the decoding algorithm here so we make the worst
4335
           assumption that one byte decodes to one unicode character.
4336
           If unfortunately one byte could decode to more unicode characters,
4337
           the decoder may write out-of-bound then.  Is it possible for the
4338
           algorithms using this function? */
4339
0
        writer->min_length += *inend - new_inptr - remain;
4340
0
        need_to_grow = 1;
4341
0
    }
4342
0
    if (need_to_grow) {
4343
0
        writer->overallocate = 1;
4344
0
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
4345
0
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4346
0
            goto onError;
4347
0
    }
4348
0
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4349
0
        goto onError;
4350
4351
0
    *endinpos = newpos;
4352
0
    *inptr = new_inptr;
4353
4354
    /* we made it! */
4355
0
    Py_DECREF(restuple);
4356
0
    return 0;
4357
4358
0
  onError:
4359
0
    Py_XDECREF(restuple);
4360
0
    return -1;
4361
0
}
4362
4363
/* --- UTF-7 Codec -------------------------------------------------------- */
4364
4365
/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4366
4367
/* Three simple macros defining base-64. */
4368
4369
/* Non-zero when c belongs to the base-64 alphabet: the ASCII letters, the
   decimal digits, '+' and '/'.  The argument is evaluated several times. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))
4376
4377
/* Map a base-64 character to its 6-bit value ('A'..'Z' -> 0..25,
   'a'..'z' -> 26..51, '0'..'9' -> 52..61, '+' -> 62).  Everything else,
   which for valid input means '/', yields 63. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)
4384
4385
/* Base-64 character encoding the low 6 bits of n (higher bits are masked
   off before the table lookup). */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
4389
4390
/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes
 * to itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
4397
4398
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4431
4432
/* ENCODE_DIRECT: true when c may be emitted as itself by the encoder.
 * Only code points 1..127 are candidates.  Set D characters always
 * qualify; Set O characters qualify when directO is true and whitespace
 * when directWS is true.  RFC2152 leaves both choices to the
 * application, hence the two switches. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directO && (utf7_category[(c)] == 1)) ||         \
      (directWS && (utf7_category[(c)] == 2))))
4443
4444
PyObject *
4445
PyUnicode_DecodeUTF7(const char *s,
4446
                     Py_ssize_t size,
4447
                     const char *errors)
4448
0
{
4449
0
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4450
0
}
4451
4452
/* The decoder.  The only state we preserve is our read position,
4453
 * i.e. how many characters we have consumed.  So if we end in the
4454
 * middle of a shift sequence we have to back off the read position
4455
 * and the output to the beginning of the sequence, otherwise we lose
4456
 * all the shift state (seen bits, number of bits seen, high
4457
 * surrogate). */
4458
4459
PyObject *
4460
PyUnicode_DecodeUTF7Stateful(const char *s,
4461
                             Py_ssize_t size,
4462
                             const char *errors,
4463
                             Py_ssize_t *consumed)
4464
0
{
4465
0
    const char *starts = s;
4466
0
    Py_ssize_t startinpos;
4467
0
    Py_ssize_t endinpos;
4468
0
    const char *e;
4469
0
    _PyUnicodeWriter writer;
4470
0
    const char *errmsg = "";
4471
0
    int inShift = 0;
4472
0
    Py_ssize_t shiftOutStart;
4473
0
    unsigned int base64bits = 0;
4474
0
    unsigned long base64buffer = 0;
4475
0
    Py_UCS4 surrogate = 0;
4476
0
    PyObject *errorHandler = NULL;
4477
0
    PyObject *exc = NULL;
4478
4479
0
    if (size == 0) {
4480
0
        if (consumed)
4481
0
            *consumed = 0;
4482
0
        _Py_RETURN_UNICODE_EMPTY();
4483
0
    }
4484
4485
    /* Start off assuming it's all ASCII. Widen later as necessary. */
4486
0
    _PyUnicodeWriter_Init(&writer);
4487
0
    writer.min_length = size;
4488
4489
0
    shiftOutStart = 0;
4490
0
    e = s + size;
4491
4492
0
    while (s < e) {
4493
0
        Py_UCS4 ch;
4494
0
      restart:
4495
0
        ch = (unsigned char) *s;
4496
4497
0
        if (inShift) { /* in a base-64 section */
4498
0
            if (IS_BASE64(ch)) { /* consume a base-64 character */
4499
0
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4500
0
                base64bits += 6;
4501
0
                s++;
4502
0
                if (base64bits >= 16) {
4503
                    /* we have enough bits for a UTF-16 value */
4504
0
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4505
0
                    base64bits -= 16;
4506
0
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4507
0
                    assert(outCh <= 0xffff);
4508
0
                    if (surrogate) {
4509
                        /* expecting a second surrogate */
4510
0
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4511
0
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4512
0
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4513
0
                                goto onError;
4514
0
                            surrogate = 0;
4515
0
                            continue;
4516
0
                        }
4517
0
                        else {
4518
0
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4519
0
                                goto onError;
4520
0
                            surrogate = 0;
4521
0
                        }
4522
0
                    }
4523
0
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4524
                        /* first surrogate */
4525
0
                        surrogate = outCh;
4526
0
                    }
4527
0
                    else {
4528
0
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4529
0
                            goto onError;
4530
0
                    }
4531
0
                }
4532
0
            }
4533
0
            else { /* now leaving a base-64 section */
4534
0
                inShift = 0;
4535
0
                if (base64bits > 0) { /* left-over bits */
4536
0
                    if (base64bits >= 6) {
4537
                        /* We've seen at least one base-64 character */
4538
0
                        s++;
4539
0
                        errmsg = "partial character in shift sequence";
4540
0
                        goto utf7Error;
4541
0
                    }
4542
0
                    else {
4543
                        /* Some bits remain; they should be zero */
4544
0
                        if (base64buffer != 0) {
4545
0
                            s++;
4546
0
                            errmsg = "non-zero padding bits in shift sequence";
4547
0
                            goto utf7Error;
4548
0
                        }
4549
0
                    }
4550
0
                }
4551
0
                if (surrogate && DECODE_DIRECT(ch)) {
4552
0
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4553
0
                        goto onError;
4554
0
                }
4555
0
                surrogate = 0;
4556
0
                if (ch == '-') {
4557
                    /* '-' is absorbed; other terminating
4558
                       characters are preserved */
4559
0
                    s++;
4560
0
                }
4561
0
            }
4562
0
        }
4563
0
        else if ( ch == '+' ) {
4564
0
            startinpos = s-starts;
4565
0
            s++; /* consume '+' */
4566
0
            if (s < e && *s == '-') { /* '+-' encodes '+' */
4567
0
                s++;
4568
0
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4569
0
                    goto onError;
4570
0
            }
4571
0
            else if (s < e && !IS_BASE64(*s)) {
4572
0
                s++;
4573
0
                errmsg = "ill-formed sequence";
4574
0
                goto utf7Error;
4575
0
            }
4576
0
            else { /* begin base64-encoded section */
4577
0
                inShift = 1;
4578
0
                surrogate = 0;
4579
0
                shiftOutStart = writer.pos;
4580
0
                base64bits = 0;
4581
0
                base64buffer = 0;
4582
0
            }
4583
0
        }
4584
0
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4585
0
            s++;
4586
0
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4587
0
                goto onError;
4588
0
        }
4589
0
        else {
4590
0
            startinpos = s-starts;
4591
0
            s++;
4592
0
            errmsg = "unexpected special character";
4593
0
            goto utf7Error;
4594
0
        }
4595
0
        continue;
4596
0
utf7Error:
4597
0
        endinpos = s-starts;
4598
0
        if (unicode_decode_call_errorhandler_writer(
4599
0
                errors, &errorHandler,
4600
0
                "utf7", errmsg,
4601
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
4602
0
                &writer))
4603
0
            goto onError;
4604
0
    }
4605
4606
    /* end of string */
4607
4608
0
    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4609
        /* if we're in an inconsistent state, that's an error */
4610
0
        inShift = 0;
4611
0
        if (surrogate ||
4612
0
                (base64bits >= 6) ||
4613
0
                (base64bits > 0 && base64buffer != 0)) {
4614
0
            endinpos = size;
4615
0
            if (unicode_decode_call_errorhandler_writer(
4616
0
                    errors, &errorHandler,
4617
0
                    "utf7", "unterminated shift sequence",
4618
0
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4619
0
                    &writer))
4620
0
                goto onError;
4621
0
            if (s < e)
4622
0
                goto restart;
4623
0
        }
4624
0
    }
4625
4626
    /* return state */
4627
0
    if (consumed) {
4628
0
        if (inShift) {
4629
0
            *consumed = startinpos;
4630
0
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4631
0
                PyObject *result = PyUnicode_FromKindAndData(
4632
0
                        writer.kind, writer.data, shiftOutStart);
4633
0
                Py_XDECREF(errorHandler);
4634
0
                Py_XDECREF(exc);
4635
0
                _PyUnicodeWriter_Dealloc(&writer);
4636
0
                return result;
4637
0
            }
4638
0
            writer.pos = shiftOutStart; /* back off output */
4639
0
        }
4640
0
        else {
4641
0
            *consumed = s-starts;
4642
0
        }
4643
0
    }
4644
4645
0
    Py_XDECREF(errorHandler);
4646
0
    Py_XDECREF(exc);
4647
0
    return _PyUnicodeWriter_Finish(&writer);
4648
4649
0
  onError:
4650
0
    Py_XDECREF(errorHandler);
4651
0
    Py_XDECREF(exc);
4652
0
    _PyUnicodeWriter_Dealloc(&writer);
4653
0
    return NULL;
4654
0
}
4655
4656
4657
/* Encode the unicode object `str` as UTF-7 and return a new bytes object,
   or NULL on failure.

   Set O characters are emitted directly unless base64SetO is non-zero,
   and whitespace is emitted directly unless base64WhiteSpace is non-zero;
   characters that are not emitted directly go through a "+...-" base-64
   section.  The `errors` argument is not used by this function. */
PyObject *
_PyUnicode_EncodeUTF7(PyObject *str,
                      int base64SetO,
                      int base64WhiteSpace,
                      const char *errors)
{
    PyObject *bytes;
    char *base;
    char *out;
    void *data;
    int kind;
    Py_ssize_t len;
    Py_ssize_t idx;
    int in_shift = 0;
    unsigned int nbits = 0;       /* bits buffered but not yet emitted */
    unsigned long bitbuf = 0;

    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    if (len == 0)
        return PyBytes_FromStringAndSize(NULL, 0);

    /* Eight output bytes per character are reserved (a generous worst
       case); refuse lengths for which that product would overflow. */
    if (len > PY_SSIZE_T_MAX / 8)
        return PyErr_NoMemory();
    bytes = PyBytes_FromStringAndSize(NULL, len * 8);
    if (bytes == NULL)
        return NULL;

    base = out = PyBytes_AS_STRING(bytes);
    for (idx = 0; idx < len; ++idx) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, idx);
        int direct = ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace);

        if (in_shift) {
            if (direct) {
                /* Leave the base-64 section: flush the pending bits,
                   left-aligned within one final base-64 character. */
                if (nbits) {
                    *out++ = TO_BASE64(bitbuf << (6-nbits));
                    bitbuf = 0;
                    nbits = 0;
                }
                in_shift = 0;
                /* A character outside the base-64 alphabet ends the
                   section implicitly; only base-64 characters and '-'
                   itself need an explicit '-' terminator. */
                if (IS_BASE64(ch) || ch == '-') {
                    *out++ = '-';
                }
                *out++ = (char) ch;
                continue;
            }
        }
        else if (ch == '+') {
            /* a literal '+' is written as "+-" */
            *out++ = '+';
            *out++ = '-';
            continue;
        }
        else if (direct) {
            *out++ = (char) ch;
            continue;
        }
        else {
            /* open a base-64 section */
            *out++ = '+';
            in_shift = 1;
        }

        /* Base-64 encode ch as one or two UTF-16 code units. */
        if (ch >= 0x10000) {
            assert(ch <= MAX_UNICODE);

            /* high surrogate first */
            nbits += 16;
            bitbuf = (bitbuf << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
            for (; nbits >= 6; nbits -= 6) {
                *out++ = TO_BASE64(bitbuf >> (nbits-6));
            }
            /* the low surrogate is handled by the common code below */
            ch = Py_UNICODE_LOW_SURROGATE(ch);
        }
        nbits += 16;
        bitbuf = (bitbuf << 16) | ch;
        for (; nbits >= 6; nbits -= 6) {
            *out++ = TO_BASE64(bitbuf >> (nbits-6));
        }
    }

    /* Flush whatever is left and close an open section. */
    if (nbits)
        *out++ = TO_BASE64(bitbuf << (6-nbits));
    if (in_shift)
        *out++ = '-';
    if (_PyBytes_Resize(&bytes, out - base) < 0)
        return NULL;
    return bytes;
}
4758
PyObject *
4759
PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4760
                     Py_ssize_t size,
4761
                     int base64SetO,
4762
                     int base64WhiteSpace,
4763
                     const char *errors)
4764
0
{
4765
0
    PyObject *result;
4766
0
    PyObject *tmp = PyUnicode_FromWideChar(s, size);
4767
0
    if (tmp == NULL)
4768
0
        return NULL;
4769
0
    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4770
0
                                   base64WhiteSpace, errors);
4771
0
    Py_DECREF(tmp);
4772
0
    return result;
4773
0
}
4774
4775
#undef IS_BASE64
4776
#undef FROM_BASE64
4777
#undef TO_BASE64
4778
#undef DECODE_DIRECT
4779
#undef ENCODE_DIRECT
4780
4781
/* --- UTF-8 Codec -------------------------------------------------------- */
4782
4783
PyObject *
4784
PyUnicode_DecodeUTF8(const char *s,
4785
                     Py_ssize_t size,
4786
                     const char *errors)
4787
405
{
4788
405
    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4789
405
}
4790
4791
#include "stringlib/asciilib.h"
4792
#include "stringlib/codecs.h"
4793
#include "stringlib/undef.h"
4794
4795
#include "stringlib/ucs1lib.h"
4796
#include "stringlib/codecs.h"
4797
#include "stringlib/undef.h"
4798
4799
#include "stringlib/ucs2lib.h"
4800
#include "stringlib/codecs.h"
4801
#include "stringlib/undef.h"
4802
4803
#include "stringlib/ucs4lib.h"
4804
#include "stringlib/codecs.h"
4805
#include "stringlib/undef.h"
4806
4807
/* Bit mask with the top bit of every byte of a C 'long' set.  A word that
   ANDs to non-zero with it holds at least one non-ASCII byte. */
#if (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#elif (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4816
4817
static Py_ssize_t
4818
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4819
64.0k
{
4820
64.0k
    const char *p = start;
4821
64.0k
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4822
4823
    /*
4824
     * Issue #17237: m68k is a bit different from most architectures in
4825
     * that objects do not use "natural alignment" - for example, int and
4826
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4827
     * won't work; also, tests have shown that skipping the "optimised
4828
     * version" will even speed up m68k.
4829
     */
4830
64.0k
#if !defined(__m68k__)
4831
64.0k
#if SIZEOF_LONG <= SIZEOF_VOID_P
4832
64.0k
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4833
64.0k
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4834
        /* Fast path, see in STRINGLIB(utf8_decode) for
4835
           an explanation. */
4836
        /* Help allocation */
4837
14.7k
        const char *_p = p;
4838
14.7k
        Py_UCS1 * q = dest;
4839
66.4k
        while (_p < aligned_end) {
4840
51.7k
            unsigned long value = *(const unsigned long *) _p;
4841
51.7k
            if (value & ASCII_CHAR_MASK)
4842
0
                break;
4843
51.7k
            *((unsigned long *)q) = value;
4844
51.7k
            _p += SIZEOF_LONG;
4845
51.7k
            q += SIZEOF_LONG;
4846
51.7k
        }
4847
14.7k
        p = _p;
4848
61.7k
        while (p < end) {
4849
47.0k
            if ((unsigned char)*p & 0x80)
4850
0
                break;
4851
47.0k
            *q++ = *p++;
4852
47.0k
        }
4853
14.7k
        return p - start;
4854
14.7k
    }
4855
49.3k
#endif
4856
49.3k
#endif
4857
396k
    while (p < end) {
4858
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4859
           for an explanation. */
4860
349k
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4861
            /* Help allocation */
4862
43.4k
            const char *_p = p;
4863
70.3k
            while (_p < aligned_end) {
4864
26.8k
                unsigned long value = *(const unsigned long *) _p;
4865
26.8k
                if (value & ASCII_CHAR_MASK)
4866
15
                    break;
4867
26.8k
                _p += SIZEOF_LONG;
4868
26.8k
            }
4869
43.4k
            p = _p;
4870
43.4k
            if (_p == end)
4871
2.65k
                break;
4872
43.4k
        }
4873
347k
        if ((unsigned char)*p & 0x80)
4874
15
            break;
4875
347k
        ++p;
4876
347k
    }
4877
49.3k
    memcpy(dest, start, p - start);
4878
49.3k
    return p - start;
4879
64.0k
}
4880
4881
static PyObject *
4882
unicode_decode_utf8(const char *s, Py_ssize_t size,
4883
                    _Py_error_handler error_handler, const char *errors,
4884
                    Py_ssize_t *consumed)
4885
64.5k
{
4886
64.5k
    _PyUnicodeWriter writer;
4887
64.5k
    const char *starts = s;
4888
64.5k
    const char *end = s + size;
4889
4890
64.5k
    Py_ssize_t startinpos;
4891
64.5k
    Py_ssize_t endinpos;
4892
64.5k
    const char *errmsg = "";
4893
64.5k
    PyObject *error_handler_obj = NULL;
4894
64.5k
    PyObject *exc = NULL;
4895
4896
64.5k
    if (size == 0) {
4897
331
        if (consumed)
4898
0
            *consumed = 0;
4899
331
        _Py_RETURN_UNICODE_EMPTY();
4900
331
    }
4901
4902
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4903
64.1k
    if (size == 1 && (unsigned char)s[0] < 128) {
4904
564
        if (consumed)
4905
0
            *consumed = 1;
4906
564
        return get_latin1_char((unsigned char)s[0]);
4907
564
    }
4908
4909
63.6k
    _PyUnicodeWriter_Init(&writer);
4910
63.6k
    writer.min_length = size;
4911
63.6k
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4912
0
        goto onError;
4913
4914
63.6k
    writer.pos = ascii_decode(s, end, writer.data);
4915
63.6k
    s += writer.pos;
4916
63.6k
    while (s < end) {
4917
44
        Py_UCS4 ch;
4918
44
        int kind = writer.kind;
4919
4920
44
        if (kind == PyUnicode_1BYTE_KIND) {
4921
30
            if (PyUnicode_IS_ASCII(writer.buffer))
4922
15
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4923
15
            else
4924
15
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4925
30
        } else if (kind == PyUnicode_2BYTE_KIND) {
4926
14
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4927
14
        } else {
4928
0
            assert(kind == PyUnicode_4BYTE_KIND);
4929
0
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4930
0
        }
4931
4932
44
        switch (ch) {
4933
15
        case 0:
4934
15
            if (s == end || consumed)
4935
15
                goto End;
4936
0
            errmsg = "unexpected end of data";
4937
0
            startinpos = s - starts;
4938
0
            endinpos = end - starts;
4939
0
            break;
4940
0
        case 1:
4941
0
            errmsg = "invalid start byte";
4942
0
            startinpos = s - starts;
4943
0
            endinpos = startinpos + 1;
4944
0
            break;
4945
0
        case 2:
4946
0
            if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4947
0
                && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4948
0
            {
4949
                /* Truncated surrogate code in range D800-DFFF */
4950
0
                goto End;
4951
0
            }
4952
            /* fall through */
4953
0
        case 3:
4954
0
        case 4:
4955
0
            errmsg = "invalid continuation byte";
4956
0
            startinpos = s - starts;
4957
0
            endinpos = startinpos + ch - 1;
4958
0
            break;
4959
29
        default:
4960
29
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4961
0
                goto onError;
4962
29
            continue;
4963
44
        }
4964
4965
0
        if (error_handler == _Py_ERROR_UNKNOWN)
4966
0
            error_handler = _Py_GetErrorHandler(errors);
4967
4968
0
        switch (error_handler) {
4969
0
        case _Py_ERROR_IGNORE:
4970
0
            s += (endinpos - startinpos);
4971
0
            break;
4972
4973
0
        case _Py_ERROR_REPLACE:
4974
0
            if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4975
0
                goto onError;
4976
0
            s += (endinpos - startinpos);
4977
0
            break;
4978
4979
0
        case _Py_ERROR_SURROGATEESCAPE:
4980
0
        {
4981
0
            Py_ssize_t i;
4982
4983
0
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4984
0
                goto onError;
4985
0
            for (i=startinpos; i<endinpos; i++) {
4986
0
                ch = (Py_UCS4)(unsigned char)(starts[i]);
4987
0
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4988
0
                                ch + 0xdc00);
4989
0
                writer.pos++;
4990
0
            }
4991
0
            s += (endinpos - startinpos);
4992
0
            break;
4993
0
        }
4994
4995
0
        default:
4996
0
            if (unicode_decode_call_errorhandler_writer(
4997
0
                    errors, &error_handler_obj,
4998
0
                    "utf-8", errmsg,
4999
0
                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5000
0
                    &writer))
5001
0
                goto onError;
5002
0
        }
5003
0
    }
5004
5005
63.6k
End:
5006
63.6k
    if (consumed)
5007
2
        *consumed = s - starts;
5008
5009
63.6k
    Py_XDECREF(error_handler_obj);
5010
63.6k
    Py_XDECREF(exc);
5011
63.6k
    return _PyUnicodeWriter_Finish(&writer);
5012
5013
0
onError:
5014
0
    Py_XDECREF(error_handler_obj);
5015
0
    Py_XDECREF(exc);
5016
0
    _PyUnicodeWriter_Dealloc(&writer);
5017
0
    return NULL;
5018
63.6k
}
5019
5020
5021
PyObject *
5022
PyUnicode_DecodeUTF8Stateful(const char *s,
5023
                             Py_ssize_t size,
5024
                             const char *errors,
5025
                             Py_ssize_t *consumed)
5026
64.5k
{
5027
64.5k
    return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5028
64.5k
}
5029
5030
5031
/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5032
   non-zero, use strict error handler otherwise.
5033
5034
   On success, write a pointer to a newly allocated wide character string into
5035
   *wstr (use PyMem_RawFree() to free the memory) and write the output length
5036
   (in number of wchar_t units) into *wlen (if wlen is set).
5037
5038
   On memory allocation failure, return -1.
5039
5040
   On decoding error (if surrogateescape is zero), return -2. If wlen is
5041
   non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5042
   is not NULL, write the decoding error message into *reason. */
5043
int
5044
_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
5045
                 const char **reason, _Py_error_handler errors)
5046
0
{
5047
0
    const char *orig_s = s;
5048
0
    const char *e;
5049
0
    wchar_t *unicode;
5050
0
    Py_ssize_t outpos;
5051
5052
0
    int surrogateescape = 0;
5053
0
    int surrogatepass = 0;
5054
0
    switch (errors)
5055
0
    {
5056
0
    case _Py_ERROR_STRICT:
5057
0
        break;
5058
0
    case _Py_ERROR_SURROGATEESCAPE:
5059
0
        surrogateescape = 1;
5060
0
        break;
5061
0
    case _Py_ERROR_SURROGATEPASS:
5062
0
        surrogatepass = 1;
5063
0
        break;
5064
0
    default:
5065
0
        return -3;
5066
0
    }
5067
5068
    /* Note: size will always be longer than the resulting Unicode
5069
       character count */
5070
0
    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
5071
0
        return -1;
5072
0
    }
5073
5074
0
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
5075
0
    if (!unicode) {
5076
0
        return -1;
5077
0
    }
5078
5079
    /* Unpack UTF-8 encoded data */
5080
0
    e = s + size;
5081
0
    outpos = 0;
5082
0
    while (s < e) {
5083
0
        Py_UCS4 ch;
5084
0
#if SIZEOF_WCHAR_T == 4
5085
0
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
5086
#else
5087
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
5088
#endif
5089
0
        if (ch > 0xFF) {
5090
0
#if SIZEOF_WCHAR_T == 4
5091
0
            Py_UNREACHABLE();
5092
#else
5093
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
5094
            /* write a surrogate pair */
5095
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5096
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5097
#endif
5098
0
        }
5099
0
        else {
5100
0
            if (!ch && s == e) {
5101
0
                break;
5102
0
            }
5103
5104
0
            if (surrogateescape) {
5105
0
                unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5106
0
            }
5107
0
            else {
5108
                /* Is it a valid three-byte code? */
5109
0
                if (surrogatepass
5110
0
                    && (e - s) >= 3
5111
0
                    && (s[0] & 0xf0) == 0xe0
5112
0
                    && (s[1] & 0xc0) == 0x80
5113
0
                    && (s[2] & 0xc0) == 0x80)
5114
0
                {
5115
0
                    ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5116
0
                    s += 3;
5117
0
                    unicode[outpos++] = ch;
5118
0
                }
5119
0
                else {
5120
0
                    PyMem_RawFree(unicode );
5121
0
                    if (reason != NULL) {
5122
0
                        switch (ch) {
5123
0
                        case 0:
5124
0
                            *reason = "unexpected end of data";
5125
0
                            break;
5126
0
                        case 1:
5127
0
                            *reason = "invalid start byte";
5128
0
                            break;
5129
                        /* 2, 3, 4 */
5130
0
                        default:
5131
0
                            *reason = "invalid continuation byte";
5132
0
                            break;
5133
0
                        }
5134
0
                    }
5135
0
                    if (wlen != NULL) {
5136
0
                        *wlen = s - orig_s;
5137
0
                    }
5138
0
                    return -2;
5139
0
                }
5140
0
            }
5141
0
        }
5142
0
    }
5143
0
    unicode[outpos] = L'\0';
5144
0
    if (wlen) {
5145
0
        *wlen = outpos;
5146
0
    }
5147
0
    *wstr = unicode;
5148
0
    return 0;
5149
0
}
5150
5151
5152
wchar_t*
5153
_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5154
                               size_t *wlen)
5155
0
{
5156
0
    wchar_t *wstr;
5157
0
    int res = _Py_DecodeUTF8Ex(arg, arglen,
5158
0
                               &wstr, wlen,
5159
0
                               NULL, _Py_ERROR_SURROGATEESCAPE);
5160
0
    if (res != 0) {
5161
        /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5162
0
        assert(res != -3);
5163
0
        if (wlen) {
5164
0
            *wlen = (size_t)res;
5165
0
        }
5166
0
        return NULL;
5167
0
    }
5168
0
    return wstr;
5169
0
}
5170
5171
5172
/* UTF-8 encoder using the strict, surrogateescape or surrogatepass error handler.
5173
5174
   On success, return 0 and write the newly allocated character string (use
5175
   PyMem_Free() to free the memory) into *str.
5176
5177
   On encoding failure, return -2 and write the position of the invalid
5178
   surrogate character into *error_pos (if error_pos is set) and the encoding
5179
   error message into *reason (if reason is set).
5180
5181
   On memory allocation failure, return -1. */
5182
int
5183
_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
5184
                 const char **reason, int raw_malloc, _Py_error_handler errors)
5185
56
{
5186
56
    const Py_ssize_t max_char_size = 4;
5187
56
    Py_ssize_t len = wcslen(text);
5188
5189
56
    assert(len >= 0);
5190
5191
56
    int surrogateescape = 0;
5192
56
    int surrogatepass = 0;
5193
56
    switch (errors)
5194
56
    {
5195
56
    case _Py_ERROR_STRICT:
5196
56
        break;
5197
0
    case _Py_ERROR_SURROGATEESCAPE:
5198
0
        surrogateescape = 1;
5199
0
        break;
5200
0
    case _Py_ERROR_SURROGATEPASS:
5201
0
        surrogatepass = 1;
5202
0
        break;
5203
0
    default:
5204
0
        return -3;
5205
56
    }
5206
5207
56
    if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5208
0
        return -1;
5209
0
    }
5210
56
    char *bytes;
5211
56
    if (raw_malloc) {
5212
56
        bytes = PyMem_RawMalloc((len + 1) * max_char_size);
5213
56
    }
5214
0
    else {
5215
0
        bytes = PyMem_Malloc((len + 1) * max_char_size);
5216
0
    }
5217
56
    if (bytes == NULL) {
5218
0
        return -1;
5219
0
    }
5220
5221
56
    char *p = bytes;
5222
56
    Py_ssize_t i;
5223
728
    for (i = 0; i < len; ) {
5224
672
        Py_ssize_t ch_pos = i;
5225
672
        Py_UCS4 ch = text[i];
5226
672
        i++;
5227
#if Py_UNICODE_SIZE == 2
5228
        if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5229
            && i < len
5230
            && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5231
        {
5232
            ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5233
            i++;
5234
        }
5235
#endif
5236
5237
672
        if (ch < 0x80) {
5238
            /* Encode ASCII */
5239
672
            *p++ = (char) ch;
5240
5241
672
        }
5242
0
        else if (ch < 0x0800) {
5243
            /* Encode Latin-1 */
5244
0
            *p++ = (char)(0xc0 | (ch >> 6));
5245
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5246
0
        }
5247
0
        else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
5248
            /* surrogateescape error handler */
5249
0
            if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
5250
0
                if (error_pos != NULL) {
5251
0
                    *error_pos = (size_t)ch_pos;
5252
0
                }
5253
0
                if (reason != NULL) {
5254
0
                    *reason = "encoding error";
5255
0
                }
5256
0
                if (raw_malloc) {
5257
0
                    PyMem_RawFree(bytes);
5258
0
                }
5259
0
                else {
5260
0
                    PyMem_Free(bytes);
5261
0
                }
5262
0
                return -2;
5263
0
            }
5264
0
            *p++ = (char)(ch & 0xff);
5265
0
        }
5266
0
        else if (ch < 0x10000) {
5267
0
            *p++ = (char)(0xe0 | (ch >> 12));
5268
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5269
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5270
0
        }
5271
0
        else {  /* ch >= 0x10000 */
5272
0
            assert(ch <= MAX_UNICODE);
5273
            /* Encode UCS4 Unicode ordinals */
5274
0
            *p++ = (char)(0xf0 | (ch >> 18));
5275
0
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5276
0
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5277
0
            *p++ = (char)(0x80 | (ch & 0x3f));
5278
0
        }
5279
672
    }
5280
56
    *p++ = '\0';
5281
5282
56
    size_t final_size = (p - bytes);
5283
56
    char *bytes2;
5284
56
    if (raw_malloc) {
5285
56
        bytes2 = PyMem_RawRealloc(bytes, final_size);
5286
56
    }
5287
0
    else {
5288
0
        bytes2 = PyMem_Realloc(bytes, final_size);
5289
0
    }
5290
56
    if (bytes2 == NULL) {
5291
0
        if (error_pos != NULL) {
5292
0
            *error_pos = (size_t)-1;
5293
0
        }
5294
0
        if (raw_malloc) {
5295
0
            PyMem_RawFree(bytes);
5296
0
        }
5297
0
        else {
5298
0
            PyMem_Free(bytes);
5299
0
        }
5300
0
        return -1;
5301
0
    }
5302
56
    *str = bytes2;
5303
56
    return 0;
5304
56
}
5305
5306
5307
/* Primary internal function which creates utf8 encoded bytes objects.
5308
5309
   Allocation strategy:  if the string is short, convert into a stack buffer
5310
   and allocate exactly as much space needed at the end.  Else allocate the
5311
   maximum possible needed (4 result bytes per Unicode character), and return
5312
   the excess memory at the end.
5313
*/
5314
static PyObject *
5315
unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5316
                    const char *errors)
5317
0
{
5318
0
    enum PyUnicode_Kind kind;
5319
0
    void *data;
5320
0
    Py_ssize_t size;
5321
5322
0
    if (!PyUnicode_Check(unicode)) {
5323
0
        PyErr_BadArgument();
5324
0
        return NULL;
5325
0
    }
5326
5327
0
    if (PyUnicode_READY(unicode) == -1)
5328
0
        return NULL;
5329
5330
0
    if (PyUnicode_UTF8(unicode))
5331
0
        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5332
0
                                         PyUnicode_UTF8_LENGTH(unicode));
5333
5334
0
    kind = PyUnicode_KIND(unicode);
5335
0
    data = PyUnicode_DATA(unicode);
5336
0
    size = PyUnicode_GET_LENGTH(unicode);
5337
5338
0
    switch (kind) {
5339
0
    default:
5340
0
        Py_UNREACHABLE();
5341
0
    case PyUnicode_1BYTE_KIND:
5342
        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5343
0
        assert(!PyUnicode_IS_ASCII(unicode));
5344
0
        return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
5345
0
    case PyUnicode_2BYTE_KIND:
5346
0
        return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
5347
0
    case PyUnicode_4BYTE_KIND:
5348
0
        return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
5349
0
    }
5350
0
}
5351
5352
/* Encode a str object to UTF-8 using the error handler named by `errors`.
   The handler enum is passed as _Py_ERROR_UNKNOWN, so only the name is
   supplied to the encoder.  Returns whatever unicode_encode_utf8() returns. */
PyObject *
_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
{
    PyObject *encoded = unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
    return encoded;
}
5357
5358
5359
PyObject *
5360
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5361
                     Py_ssize_t size,
5362
                     const char *errors)
5363
0
{
5364
0
    PyObject *v, *unicode;
5365
5366
0
    unicode = PyUnicode_FromWideChar(s, size);
5367
0
    if (unicode == NULL)
5368
0
        return NULL;
5369
0
    v = _PyUnicode_AsUTF8String(unicode, errors);
5370
0
    Py_DECREF(unicode);
5371
0
    return v;
5372
0
}
5373
5374
/* Encode a str object to UTF-8 without naming an error handler
   (`errors` is passed as NULL). */
PyObject *
PyUnicode_AsUTF8String(PyObject *unicode)
{
    const char *errors = NULL;
    return _PyUnicode_AsUTF8String(unicode, errors);
}
5379
5380
/* --- UTF-32 Codec ------------------------------------------------------- */
5381
5382
PyObject *
5383
PyUnicode_DecodeUTF32(const char *s,
5384
                      Py_ssize_t size,
5385
                      const char *errors,
5386
                      int *byteorder)
5387
0
{
5388
0
    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5389
0
}
5390
5391
PyObject *
5392
PyUnicode_DecodeUTF32Stateful(const char *s,
5393
                              Py_ssize_t size,
5394
                              const char *errors,
5395
                              int *byteorder,
5396
                              Py_ssize_t *consumed)
5397
0
{
5398
0
    const char *starts = s;
5399
0
    Py_ssize_t startinpos;
5400
0
    Py_ssize_t endinpos;
5401
0
    _PyUnicodeWriter writer;
5402
0
    const unsigned char *q, *e;
5403
0
    int le, bo = 0;       /* assume native ordering by default */
5404
0
    const char *encoding;
5405
0
    const char *errmsg = "";
5406
0
    PyObject *errorHandler = NULL;
5407
0
    PyObject *exc = NULL;
5408
5409
0
    q = (const unsigned char *)s;
5410
0
    e = q + size;
5411
5412
0
    if (byteorder)
5413
0
        bo = *byteorder;
5414
5415
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5416
       byte order setting accordingly. In native mode, the leading BOM
5417
       mark is skipped, in all other modes, it is copied to the output
5418
       stream as-is (giving a ZWNBSP character). */
5419
0
    if (bo == 0 && size >= 4) {
5420
0
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5421
0
        if (bom == 0x0000FEFF) {
5422
0
            bo = -1;
5423
0
            q += 4;
5424
0
        }
5425
0
        else if (bom == 0xFFFE0000) {
5426
0
            bo = 1;
5427
0
            q += 4;
5428
0
        }
5429
0
        if (byteorder)
5430
0
            *byteorder = bo;
5431
0
    }
5432
5433
0
    if (q == e) {
5434
0
        if (consumed)
5435
0
            *consumed = size;
5436
0
        _Py_RETURN_UNICODE_EMPTY();
5437
0
    }
5438
5439
#ifdef WORDS_BIGENDIAN
5440
    le = bo < 0;
5441
#else
5442
0
    le = bo <= 0;
5443
0
#endif
5444
0
    encoding = le ? "utf-32-le" : "utf-32-be";
5445
5446
0
    _PyUnicodeWriter_Init(&writer);
5447
0
    writer.min_length = (e - q + 3) / 4;
5448
0
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5449
0
        goto onError;
5450
5451
0
    while (1) {
5452
0
        Py_UCS4 ch = 0;
5453
0
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5454
5455
0
        if (e - q >= 4) {
5456
0
            enum PyUnicode_Kind kind = writer.kind;
5457
0
            void *data = writer.data;
5458
0
            const unsigned char *last = e - 4;
5459
0
            Py_ssize_t pos = writer.pos;
5460
0
            if (le) {
5461
0
                do {
5462
0
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5463
0
                    if (ch > maxch)
5464
0
                        break;
5465
0
                    if (kind != PyUnicode_1BYTE_KIND &&
5466
0
                        Py_UNICODE_IS_SURROGATE(ch))
5467
0
                        break;
5468
0
                    PyUnicode_WRITE(kind, data, pos++, ch);
5469
0
                    q += 4;
5470
0
                } while (q <= last);
5471
0
            }
5472
0
            else {
5473
0
                do {
5474
0
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5475
0
                    if (ch > maxch)
5476
0
                        break;
5477
0
                    if (kind != PyUnicode_1BYTE_KIND &&
5478
0
                        Py_UNICODE_IS_SURROGATE(ch))
5479
0
                        break;
5480
0
                    PyUnicode_WRITE(kind, data, pos++, ch);
5481
0
                    q += 4;
5482
0
                } while (q <= last);
5483
0
            }
5484
0
            writer.pos = pos;
5485
0
        }
5486
5487
0
        if (Py_UNICODE_IS_SURROGATE(ch)) {
5488
0
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5489
0
            startinpos = ((const char *)q) - starts;
5490
0
            endinpos = startinpos + 4;
5491
0
        }
5492
0
        else if (ch <= maxch) {
5493
0
            if (q == e || consumed)
5494
0
                break;
5495
            /* remaining bytes at the end? (size should be divisible by 4) */
5496
0
            errmsg = "truncated data";
5497
0
            startinpos = ((const char *)q) - starts;
5498
0
            endinpos = ((const char *)e) - starts;
5499
0
        }
5500
0
        else {
5501
0
            if (ch < 0x110000) {
5502
0
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5503
0
                    goto onError;
5504
0
                q += 4;
5505
0
                continue;
5506
0
            }
5507
0
            errmsg = "code point not in range(0x110000)";
5508
0
            startinpos = ((const char *)q) - starts;
5509
0
            endinpos = startinpos + 4;
5510
0
        }
5511
5512
        /* The remaining input chars are ignored if the callback
5513
           chooses to skip the input */
5514
0
        if (unicode_decode_call_errorhandler_writer(
5515
0
                errors, &errorHandler,
5516
0
                encoding, errmsg,
5517
0
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5518
0
                &writer))
5519
0
            goto onError;
5520
0
    }
5521
5522
0
    if (consumed)
5523
0
        *consumed = (const char *)q-starts;
5524
5525
0
    Py_XDECREF(errorHandler);
5526
0
    Py_XDECREF(exc);
5527
0
    return _PyUnicodeWriter_Finish(&writer);
5528
5529
0
  onError:
5530
0
    _PyUnicodeWriter_Dealloc(&writer);
5531
0
    Py_XDECREF(errorHandler);
5532
0
    Py_XDECREF(exc);
5533
0
    return NULL;
5534
0
}
5535
5536
/* Encode the str object `str` to UTF-32 and return a new bytes object, or
   NULL with an exception set on failure.

   byteorder: -1 selects little endian ("utf-32-le"), 1 selects big endian
   ("utf-32-be"), and 0 selects native order with a leading BOM ("utf-32").

   errors names the error handler that is called for surrogate code points,
   which cannot be encoded.  The handler may return bytes (length must be a
   multiple of 4) or an ASCII-only str as replacement, together with the
   position at which encoding resumes.

   The output buffer initially reserves one 4-byte unit per input character.
   It is grown according to both the replacement size and the resume position
   returned by the handler, so a handler that resumes at or before the
   failing character cannot make the encoder write past the buffer. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* One unit per character, plus one for the BOM in native mode. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* Latin-1 strings contain no surrogates: encode in one pass. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode until the end of the string or the next surrogate. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* `pos` is only updated once the replacement has been validated, so
           the exceptions raised below report the failing character itself
           regardless of the resume position chosen by the handler. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }

        /* Four bytes are reserved for each input character.  Skipped
           characters (newpos > pos) free their reserved units; characters
           that will be encoded again (newpos < pos) need new ones. */
        if (newpos < pos) {
            if (moreunits > PY_SSIZE_T_MAX - (pos - newpos)) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            moreunits += pos - newpos;
        }
        else {
            moreunits -= newpos - pos;
        }
        pos = newpos;

        if (moreunits > 0) {
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * moreunits) < 0)
                goto error;
            /* The resize may have moved the buffer. */
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 4;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
5682
5683
PyObject *
5684
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5685
                      Py_ssize_t size,
5686
                      const char *errors,
5687
                      int byteorder)
5688
0
{
5689
0
    PyObject *result;
5690
0
    PyObject *tmp = PyUnicode_FromWideChar(s, size);
5691
0
    if (tmp == NULL)
5692
0
        return NULL;
5693
0
    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5694
0
    Py_DECREF(tmp);
5695
0
    return result;
5696
0
}
5697
5698
/* Encode a str object to UTF-32 with no named error handler and byteorder 0
   (native order; the encoder prepends a BOM in that mode). */
PyObject *
PyUnicode_AsUTF32String(PyObject *unicode)
{
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF32(unicode, NULL, byteorder);
}
5703
5704
/* --- UTF-16 Codec ------------------------------------------------------- */
5705
5706
PyObject *
5707
PyUnicode_DecodeUTF16(const char *s,
5708
                      Py_ssize_t size,
5709
                      const char *errors,
5710
                      int *byteorder)
5711
0
{
5712
0
    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5713
0
}
5714
5715
PyObject *
5716
PyUnicode_DecodeUTF16Stateful(const char *s,
5717
                              Py_ssize_t size,
5718
                              const char *errors,
5719
                              int *byteorder,
5720
                              Py_ssize_t *consumed)
5721
0
{
5722
0
    const char *starts = s;
5723
0
    Py_ssize_t startinpos;
5724
0
    Py_ssize_t endinpos;
5725
0
    _PyUnicodeWriter writer;
5726
0
    const unsigned char *q, *e;
5727
0
    int bo = 0;       /* assume native ordering by default */
5728
0
    int native_ordering;
5729
0
    const char *errmsg = "";
5730
0
    PyObject *errorHandler = NULL;
5731
0
    PyObject *exc = NULL;
5732
0
    const char *encoding;
5733
5734
0
    q = (const unsigned char *)s;
5735
0
    e = q + size;
5736
5737
0
    if (byteorder)
5738
0
        bo = *byteorder;
5739
5740
    /* Check for BOM marks (U+FEFF) in the input and adjust current
5741
       byte order setting accordingly. In native mode, the leading BOM
5742
       mark is skipped, in all other modes, it is copied to the output
5743
       stream as-is (giving a ZWNBSP character). */
5744
0
    if (bo == 0 && size >= 2) {
5745
0
        const Py_UCS4 bom = (q[1] << 8) | q[0];
5746
0
        if (bom == 0xFEFF) {
5747
0
            q += 2;
5748
0
            bo = -1;
5749
0
        }
5750
0
        else if (bom == 0xFFFE) {
5751
0
            q += 2;
5752
0
            bo = 1;
5753
0
        }
5754
0
        if (byteorder)
5755
0
            *byteorder = bo;
5756
0
    }
5757
5758
0
    if (q == e) {
5759
0
        if (consumed)
5760
0
            *consumed = size;
5761
0
        _Py_RETURN_UNICODE_EMPTY();
5762
0
    }
5763
5764
0
#if PY_LITTLE_ENDIAN
5765
0
    native_ordering = bo <= 0;
5766
0
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5767
#else
5768
    native_ordering = bo >= 0;
5769
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5770
#endif
5771
5772
    /* Note: size will always be longer than the resulting Unicode
5773
       character count normally.  Error handler will take care of
5774
       resizing when needed. */
5775
0
    _PyUnicodeWriter_Init(&writer);
5776
0
    writer.min_length = (e - q + 1) / 2;
5777
0
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5778
0
        goto onError;
5779
5780
0
    while (1) {
5781
0
        Py_UCS4 ch = 0;
5782
0
        if (e - q >= 2) {
5783
0
            int kind = writer.kind;
5784
0
            if (kind == PyUnicode_1BYTE_KIND) {
5785
0
                if (PyUnicode_IS_ASCII(writer.buffer))
5786
0
                    ch = asciilib_utf16_decode(&q, e,
5787
0
                            (Py_UCS1*)writer.data, &writer.pos,
5788
0
                            native_ordering);
5789
0
                else
5790
0
                    ch = ucs1lib_utf16_decode(&q, e,
5791
0
                            (Py_UCS1*)writer.data, &writer.pos,
5792
0
                            native_ordering);
5793
0
            } else if (kind == PyUnicode_2BYTE_KIND) {
5794
0
                ch = ucs2lib_utf16_decode(&q, e,
5795
0
                        (Py_UCS2*)writer.data, &writer.pos,
5796
0
                        native_ordering);
5797
0
            } else {
5798
0
                assert(kind == PyUnicode_4BYTE_KIND);
5799
0
                ch = ucs4lib_utf16_decode(&q, e,
5800
0
                        (Py_UCS4*)writer.data, &writer.pos,
5801
0
                        native_ordering);
5802
0
            }
5803
0
        }
5804
5805
0
        switch (ch)
5806
0
        {
5807
0
        case 0:
5808
            /* remaining byte at the end? (size should be even) */
5809
0
            if (q == e || consumed)
5810
0
                goto End;
5811
0
            errmsg = "truncated data";
5812
0
            startinpos = ((const char *)q) - starts;
5813
0
            endinpos = ((const char *)e) - starts;
5814
0
            break;
5815
            /* The remaining input chars are ignored if the callback
5816
               chooses to skip the input */
5817
0
        case 1:
5818
0
            q -= 2;
5819
0
            if (consumed)
5820
0
                goto End;
5821
0
            errmsg = "unexpected end of data";
5822
0
            startinpos = ((const char *)q) - starts;
5823
0
            endinpos = ((const char *)e) - starts;
5824
0
            break;
5825
0
        case 2:
5826
0
            errmsg = "illegal encoding";
5827
0
            startinpos = ((const char *)q) - 2 - starts;
5828
0
            endinpos = startinpos + 2;
5829
0
            break;
5830
0
        case 3:
5831
0
            errmsg = "illegal UTF-16 surrogate";
5832
0
            startinpos = ((const char *)q) - 4 - starts;
5833
0
            endinpos = startinpos + 2;
5834
0
            break;
5835
0
        default:
5836
0
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5837
0
                goto onError;
5838
0
            continue;
5839
0
        }
5840
5841
0
        if (unicode_decode_call_errorhandler_writer(
5842
0
                errors,
5843
0
                &errorHandler,
5844
0
                encoding, errmsg,
5845
0
                &starts,
5846
0
                (const char **)&e,
5847
0
                &startinpos,
5848
0
                &endinpos,
5849
0
                &exc,
5850
0
                (const char **)&q,
5851
0
                &writer))
5852
0
            goto onError;
5853
0
    }
5854
5855
0
End:
5856
0
    if (consumed)
5857
0
        *consumed = (const char *)q-starts;
5858
5859
0
    Py_XDECREF(errorHandler);
5860
0
    Py_XDECREF(exc);
5861
0
    return _PyUnicodeWriter_Finish(&writer);
5862
5863
0
  onError:
5864
0
    _PyUnicodeWriter_Dealloc(&writer);
5865
0
    Py_XDECREF(errorHandler);
5866
0
    Py_XDECREF(exc);
5867
0
    return NULL;
5868
0
}
5869
5870
/* Encode the str object `str` to UTF-16 and return a new bytes object, or
   NULL with an exception set on failure.

   byteorder: a negative value selects little endian ("utf-16-le"), a
   positive value big endian ("utf-16-be"), and 0 selects native order with
   a leading BOM ("utf-16").

   errors names the error handler that is called for surrogate code points,
   which cannot be encoded.  The handler may return bytes (length must be
   even) or an ASCII-only str as replacement, together with the position at
   which encoding resumes.

   The output buffer initially reserves one 2-byte unit per input character
   plus one extra unit per character >= 0x10000.  It is grown according to
   both the replacement size and the resume position returned by the
   handler, so a handler that resumes at or before the failing character
   cannot make the encoder write past the buffer. */
PyObject *
_PyUnicode_EncodeUTF16(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    unsigned short *out;
    Py_ssize_t pairs;
#if PY_BIG_ENDIAN
    int native_ordering = byteorder >= 0;
#else
    int native_ordering = byteorder <= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* Count the characters that need a surrogate pair (two units). */
    pairs = 0;
    if (kind == PyUnicode_4BYTE_KIND) {
        const Py_UCS4 *in = (const Py_UCS4 *)data;
        const Py_UCS4 *end = in + len;
        while (in < end) {
            if (*in++ >= 0x10000) {
                pairs++;
            }
        }
    }
    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
        return PyErr_NoMemory();
    }
    nsize = len + pairs + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
    if (v == NULL) {
        return NULL;
    }

    /* output buffer is 2-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
    out = (unsigned short *)PyBytes_AS_STRING(v);
    if (byteorder == 0) {
        *out++ = 0xFEFF;
    }
    if (len == 0) {
        goto done;
    }

    /* Latin-1 strings contain no surrogates: encode in one pass. */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    if (byteorder < 0) {
        encoding = "utf-16-le";
    }
    else if (byteorder > 0) {
        encoding = "utf-16-be";
    }
    else {
        encoding = "utf-16";
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t newpos, repsize, moreunits;

        /* Encode until the end of the string or the next surrogate. */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* `pos` is only updated once the replacement has been validated, so
           the exceptions raised below report the failing character itself
           regardless of the resume position chosen by the handler. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &newpos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 1) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 2;
        }
        else {
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos, pos + 1,
                                       "surrogates not allowed");
                goto error;
            }
        }

        /* Two bytes are reserved for each input character.  Skipped
           characters (newpos > pos) free their reserved units.  Characters
           that will be encoded again (newpos < pos) need new ones: up to two
           units each for 4-byte strings, since they may become surrogate
           pairs, and one unit each otherwise. */
        if (newpos < pos) {
            Py_ssize_t redo = pos - newpos;
            if (kind == PyUnicode_4BYTE_KIND) {
                /* cannot overflow: redo <= len <= PY_SSIZE_T_MAX / 2 */
                redo *= 2;
            }
            if (moreunits > PY_SSIZE_T_MAX - redo) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            moreunits += redo;
        }
        else {
            moreunits -= newpos - pos;
        }
        pos = newpos;

        if (moreunits > 0) {
            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * moreunits) < 0)
                goto error;
            /* The resize may have moved the buffer. */
            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += repsize / 2;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
6035
6036
PyObject *
6037
PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6038
                      Py_ssize_t size,
6039
                      const char *errors,
6040
                      int byteorder)
6041
0
{
6042
0
    PyObject *result;
6043
0
    PyObject *tmp = PyUnicode_FromWideChar(s, size);
6044
0
    if (tmp == NULL)
6045
0
        return NULL;
6046
0
    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6047
0
    Py_DECREF(tmp);
6048
0
    return result;
6049
0
}
6050
6051
/* Encode a str object to UTF-16 with no named error handler and byteorder 0
   (native order; the encoder prepends a BOM in that mode). */
PyObject *
PyUnicode_AsUTF16String(PyObject *unicode)
{
    const int byteorder = 0;
    return _PyUnicode_EncodeUTF16(unicode, NULL, byteorder);
}
6056
6057
/* --- Unicode Escape Codec ----------------------------------------------- */
6058
6059
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably the character-name lookup API used by the
   escape decoder below and filled in on first use -- confirm against the
   code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
6060
6061
PyObject *
6062
_PyUnicode_DecodeUnicodeEscape(const char *s,
6063
                               Py_ssize_t size,
6064
                               const char *errors,
6065
                               const char **first_invalid_escape)
6066
4
{
6067
4
    const char *starts = s;
6068
4
    _PyUnicodeWriter writer;
6069
4
    const char *end;
6070
4
    PyObject *errorHandler = NULL;
6071
4
    PyObject *exc = NULL;
6072
6073
    // so we can remember if we've seen an invalid escape char or not
6074
4
    *first_invalid_escape = NULL;
6075
6076
4
    if (size == 0) {
6077
0
        _Py_RETURN_UNICODE_EMPTY();
6078
0
    }
6079
    /* Escaped strings will always be longer than the resulting
6080
       Unicode string, so we start with size here and then reduce the
6081
       length after conversion to the true value.
6082
       (but if the error callback returns a long replacement string
6083
       we'll have to allocate more space) */
6084
4
    _PyUnicodeWriter_Init(&writer);
6085
4
    writer.min_length = size;
6086
4
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6087
0
        goto onError;
6088
0
    }
6089
6090
4
    end = s + size;
6091
8
    while (s < end) {
6092
4
        unsigned char c = (unsigned char) *s++;
6093
4
        Py_UCS4 ch;
6094
4
        int count;
6095
4
        Py_ssize_t startinpos;
6096
4
        Py_ssize_t endinpos;
6097
4
        const char *message;
6098
6099
4
#define WRITE_ASCII_CHAR(ch)                                                  \
6100
4
            do {                                                              \
6101
2
                assert(ch <= 127);                                            \
6102
2
                assert(writer.pos < writer.size);                             \
6103
2
                PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch);  \
6104
2
            } while(0)
6105
6106
4
#define WRITE_CHAR(ch)                                                        \
6107
4
            do {                                                              \
6108
2
                if (ch <= writer.maxchar) {                                   \
6109
2
                    assert(writer.pos < writer.size);                         \
6110
2
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6111
2
                }                                                             \
6112
2
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6113
0
                    goto onError;                                             \
6114
0
                }                                                             \
6115
2
            } while(0)
6116
6117
        /* Non-escape characters are interpreted as Unicode ordinals */
6118
4
        if (c != '\\') {
6119
0
            WRITE_CHAR(c);
6120
0
            continue;
6121
0
        }
6122
6123
4
        startinpos = s - starts - 1;
6124
        /* \ - Escapes */
6125
4
        if (s >= end) {
6126
0
            message = "\\ at end of string";
6127
0
            goto error;
6128
0
        }
6129
4
        c = (unsigned char) *s++;
6130
6131
4
        assert(writer.pos < writer.size);
6132
4
        switch (c) {
6133
6134
            /* \x escapes */
6135
0
        case '\n': continue;
6136
0
        case '\\': WRITE_ASCII_CHAR('\\'); continue;
6137
0
        case '\'': WRITE_ASCII_CHAR('\''); continue;
6138
0
        case '\"': WRITE_ASCII_CHAR('\"'); continue;
6139
0
        case 'b': WRITE_ASCII_CHAR('\b'); continue;
6140
        /* FF */
6141
0
        case 'f': WRITE_ASCII_CHAR('\014'); continue;
6142
0
        case 't': WRITE_ASCII_CHAR('\t'); continue;
6143
2
        case 'n': WRITE_ASCII_CHAR('\n'); continue;
6144
2
        case 'r': WRITE_ASCII_CHAR('\r'); continue;
6145
        /* VT */
6146
0
        case 'v': WRITE_ASCII_CHAR('\013'); continue;
6147
        /* BEL, not classic C */
6148
0
        case 'a': WRITE_ASCII_CHAR('\007'); continue;
6149
6150
            /* \OOO (octal) escapes */
6151
2
        case '0': case '1': case '2': case '3':
6152
2
        case '4': case '5': case '6': case '7':
6153
2
            ch = c - '0';
6154
2
            if (s < end && '0' <= *s && *s <= '7') {
6155
0
                ch = (ch<<3) + *s++ - '0';
6156
0
                if (s < end && '0' <= *s && *s <= '7') {
6157
0
                    ch = (ch<<3) + *s++ - '0';
6158
0
                }
6159
0
            }
6160
2
            WRITE_CHAR(ch);
6161
2
            continue;
6162
6163
            /* hex escapes */
6164
            /* \xXX */
6165
2
        case 'x':
6166
0
            count = 2;
6167
0
            message = "truncated \\xXX escape";
6168
0
            goto hexescape;
6169
6170
            /* \uXXXX */
6171
0
        case 'u':
6172
0
            count = 4;
6173
0
            message = "truncated \\uXXXX escape";
6174
0
            goto hexescape;
6175
6176
            /* \UXXXXXXXX */
6177
0
        case 'U':
6178
0
            count = 8;
6179
0
            message = "truncated \\UXXXXXXXX escape";
6180
0
        hexescape:
6181
0
            for (ch = 0; count && s < end; ++s, --count) {
6182
0
                c = (unsigned char)*s;
6183
0
                ch <<= 4;
6184
0
                if (c >= '0' && c <= '9') {
6185
0
                    ch += c - '0';
6186
0
                }
6187
0
                else if (c >= 'a' && c <= 'f') {
6188
0
                    ch += c - ('a' - 10);
6189
0
                }
6190
0
                else if (c >= 'A' && c <= 'F') {
6191
0
                    ch += c - ('A' - 10);
6192
0
                }
6193
0
                else {
6194
0
                    break;
6195
0
                }
6196
0
            }
6197
0
            if (count) {
6198
0
                goto error;
6199
0
            }
6200
6201
            /* when we get here, ch is a 32-bit unicode character */
6202
0
            if (ch > MAX_UNICODE) {
6203
0
                message = "illegal Unicode character";
6204
0
                goto error;
6205
0
            }
6206
6207
0
            WRITE_CHAR(ch);
6208
0
            continue;
6209
6210
            /* \N{name} */
6211
0
        case 'N':
6212
0
            if (ucnhash_CAPI == NULL) {
6213
                /* load the unicode data module */
6214
0
                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6215
0
                                                PyUnicodeData_CAPSULE_NAME, 1);
6216
0
                if (ucnhash_CAPI == NULL) {
6217
0
                    PyErr_SetString(
6218
0
                        PyExc_UnicodeError,
6219
0
                        "\\N escapes not supported (can't load unicodedata module)"
6220
0
                        );
6221
0
                    goto onError;
6222
0
                }
6223
0
            }
6224
6225
0
            message = "malformed \\N character escape";
6226
0
            if (s < end && *s == '{') {
6227
0
                const char *start = ++s;
6228
0
                size_t namelen;
6229
                /* look for the closing brace */
6230
0
                while (s < end && *s != '}')
6231
0
                    s++;
6232
0
                namelen = s - start;
6233
0
                if (namelen && s < end) {
6234
                    /* found a name.  look it up in the unicode database */
6235
0
                    s++;
6236
0
                    ch = 0xffffffff; /* in case 'getcode' messes up */
6237
0
                    if (namelen <= INT_MAX &&
6238
0
                        ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6239
0
                                              &ch, 0)) {
6240
0
                        assert(ch <= MAX_UNICODE);
6241
0
                        WRITE_CHAR(ch);
6242
0
                        continue;
6243
0
                    }
6244
0
                    message = "unknown Unicode character name";
6245
0
                }
6246
0
            }
6247
0
            goto error;
6248
6249
0
        default:
6250
0
            if (*first_invalid_escape == NULL) {
6251
0
                *first_invalid_escape = s-1; /* Back up one char, since we've
6252
                                                already incremented s. */
6253
0
            }
6254
0
            WRITE_ASCII_CHAR('\\');
6255
0
            WRITE_CHAR(c);
6256
0
            continue;
6257
4
        }
6258
6259
0
      error:
6260
0
        endinpos = s-starts;
6261
0
        writer.min_length = end - s + writer.pos;
6262
0
        if (unicode_decode_call_errorhandler_writer(
6263
0
                errors, &errorHandler,
6264
0
                "unicodeescape", message,
6265
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6266
0
                &writer)) {
6267
0
            goto onError;
6268
0
        }
6269
0
        assert(end - s <= writer.size - writer.pos);
6270
6271
0
#undef WRITE_ASCII_CHAR
6272
0
#undef WRITE_CHAR
6273
0
    }
6274
6275
4
    Py_XDECREF(errorHandler);
6276
4
    Py_XDECREF(exc);
6277
4
    return _PyUnicodeWriter_Finish(&writer);
6278
6279
0
  onError:
6280
0
    _PyUnicodeWriter_Dealloc(&writer);
6281
0
    Py_XDECREF(errorHandler);
6282
0
    Py_XDECREF(exc);
6283
0
    return NULL;
6284
4
}
6285
6286
PyObject *
6287
PyUnicode_DecodeUnicodeEscape(const char *s,
6288
                              Py_ssize_t size,
6289
                              const char *errors)
6290
0
{
6291
0
    const char *first_invalid_escape;
6292
0
    PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6293
0
                                                      &first_invalid_escape);
6294
0
    if (result == NULL)
6295
0
        return NULL;
6296
0
    if (first_invalid_escape != NULL) {
6297
0
        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6298
0
                             "invalid escape sequence '\\%c'",
6299
0
                             (unsigned char)*first_invalid_escape) < 0) {
6300
0
            Py_DECREF(result);
6301
0
            return NULL;
6302
0
        }
6303
0
    }
6304
0
    return result;
6305
0
}
6306
6307
/* Return a Unicode-Escape string version of the Unicode object. */
6308
6309
PyObject *
6310
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
6311
0
{
6312
0
    Py_ssize_t i, len;
6313
0
    PyObject *repr;
6314
0
    char *p;
6315
0
    enum PyUnicode_Kind kind;
6316
0
    void *data;
6317
0
    Py_ssize_t expandsize;
6318
6319
    /* Initial allocation is based on the longest-possible character
6320
       escape.
6321
6322
       For UCS1 strings it's '\xxx', 4 bytes per source character.
6323
       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6324
       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
6325
    */
6326
6327
0
    if (!PyUnicode_Check(unicode)) {
6328
0
        PyErr_BadArgument();
6329
0
        return NULL;
6330
0
    }
6331
0
    if (PyUnicode_READY(unicode) == -1) {
6332
0
        return NULL;
6333
0
    }
6334
6335
0
    len = PyUnicode_GET_LENGTH(unicode);
6336
0
    if (len == 0) {
6337
0
        return PyBytes_FromStringAndSize(NULL, 0);
6338
0
    }
6339
6340
0
    kind = PyUnicode_KIND(unicode);
6341
0
    data = PyUnicode_DATA(unicode);
6342
    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6343
       bytes, and 1 byte characters 4. */
6344
0
    expandsize = kind * 2 + 2;
6345
0
    if (len > PY_SSIZE_T_MAX / expandsize) {
6346
0
        return PyErr_NoMemory();
6347
0
    }
6348
0
    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6349
0
    if (repr == NULL) {
6350
0
        return NULL;
6351
0
    }
6352
6353
0
    p = PyBytes_AS_STRING(repr);
6354
0
    for (i = 0; i < len; i++) {
6355
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6356
6357
        /* U+0000-U+00ff range */
6358
0
        if (ch < 0x100) {
6359
0
            if (ch >= ' ' && ch < 127) {
6360
0
                if (ch != '\\') {
6361
                    /* Copy printable US ASCII as-is */
6362
0
                    *p++ = (char) ch;
6363
0
                }
6364
                /* Escape backslashes */
6365
0
                else {
6366
0
                    *p++ = '\\';
6367
0
                    *p++ = '\\';
6368
0
                }
6369
0
            }
6370
6371
            /* Map special whitespace to '\t', \n', '\r' */
6372
0
            else if (ch == '\t') {
6373
0
                *p++ = '\\';
6374
0
                *p++ = 't';
6375
0
            }
6376
0
            else if (ch == '\n') {
6377
0
                *p++ = '\\';
6378
0
                *p++ = 'n';
6379
0
            }
6380
0
            else if (ch == '\r') {
6381
0
                *p++ = '\\';
6382
0
                *p++ = 'r';
6383
0
            }
6384
6385
            /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6386
0
            else {
6387
0
                *p++ = '\\';
6388
0
                *p++ = 'x';
6389
0
                *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6390
0
                *p++ = Py_hexdigits[ch & 0x000F];
6391
0
            }
6392
0
        }
6393
        /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
6394
0
        else if (ch < 0x10000) {
6395
0
            *p++ = '\\';
6396
0
            *p++ = 'u';
6397
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6398
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6399
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6400
0
            *p++ = Py_hexdigits[ch & 0x000F];
6401
0
        }
6402
        /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6403
0
        else {
6404
6405
            /* Make sure that the first two digits are zero */
6406
0
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6407
0
            *p++ = '\\';
6408
0
            *p++ = 'U';
6409
0
            *p++ = '0';
6410
0
            *p++ = '0';
6411
0
            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6412
0
            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6413
0
            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6414
0
            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6415
0
            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6416
0
            *p++ = Py_hexdigits[ch & 0x0000000F];
6417
0
        }
6418
0
    }
6419
6420
0
    assert(p - PyBytes_AS_STRING(repr) > 0);
6421
0
    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6422
0
        return NULL;
6423
0
    }
6424
0
    return repr;
6425
0
}
6426
6427
PyObject *
6428
PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6429
                              Py_ssize_t size)
6430
0
{
6431
0
    PyObject *result;
6432
0
    PyObject *tmp = PyUnicode_FromWideChar(s, size);
6433
0
    if (tmp == NULL) {
6434
0
        return NULL;
6435
0
    }
6436
6437
0
    result = PyUnicode_AsUnicodeEscapeString(tmp);
6438
0
    Py_DECREF(tmp);
6439
0
    return result;
6440
0
}
6441
6442
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6443
6444
PyObject *
6445
PyUnicode_DecodeRawUnicodeEscape(const char *s,
6446
                                 Py_ssize_t size,
6447
                                 const char *errors)
6448
0
{
6449
0
    const char *starts = s;
6450
0
    _PyUnicodeWriter writer;
6451
0
    const char *end;
6452
0
    PyObject *errorHandler = NULL;
6453
0
    PyObject *exc = NULL;
6454
6455
0
    if (size == 0) {
6456
0
        _Py_RETURN_UNICODE_EMPTY();
6457
0
    }
6458
6459
    /* Escaped strings will always be longer than the resulting
6460
       Unicode string, so we start with size here and then reduce the
6461
       length after conversion to the true value. (But decoding error
6462
       handler might have to resize the string) */
6463
0
    _PyUnicodeWriter_Init(&writer);
6464
0
     writer.min_length = size;
6465
0
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6466
0
        goto onError;
6467
0
    }
6468
6469
0
    end = s + size;
6470
0
    while (s < end) {
6471
0
        unsigned char c = (unsigned char) *s++;
6472
0
        Py_UCS4 ch;
6473
0
        int count;
6474
0
        Py_ssize_t startinpos;
6475
0
        Py_ssize_t endinpos;
6476
0
        const char *message;
6477
6478
0
#define WRITE_CHAR(ch)                                                        \
6479
0
            do {                                                              \
6480
0
                if (ch <= writer.maxchar) {                                   \
6481
0
                    assert(writer.pos < writer.size);                         \
6482
0
                    PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6483
0
                }                                                             \
6484
0
                else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6485
0
                    goto onError;                                             \
6486
0
                }                                                             \
6487
0
            } while(0)
6488
6489
        /* Non-escape characters are interpreted as Unicode ordinals */
6490
0
        if (c != '\\' || s >= end) {
6491
0
            WRITE_CHAR(c);
6492
0
            continue;
6493
0
        }
6494
6495
0
        c = (unsigned char) *s++;
6496
0
        if (c == 'u') {
6497
0
            count = 4;
6498
0
            message = "truncated \\uXXXX escape";
6499
0
        }
6500
0
        else if (c == 'U') {
6501
0
            count = 8;
6502
0
            message = "truncated \\UXXXXXXXX escape";
6503
0
        }
6504
0
        else {
6505
0
            assert(writer.pos < writer.size);
6506
0
            PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6507
0
            WRITE_CHAR(c);
6508
0
            continue;
6509
0
        }
6510
0
        startinpos = s - starts - 2;
6511
6512
        /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6513
0
        for (ch = 0; count && s < end; ++s, --count) {
6514
0
            c = (unsigned char)*s;
6515
0
            ch <<= 4;
6516
0
            if (c >= '0' && c <= '9') {
6517
0
                ch += c - '0';
6518
0
            }
6519
0
            else if (c >= 'a' && c <= 'f') {
6520
0
                ch += c - ('a' - 10);
6521
0
            }
6522
0
            else if (c >= 'A' && c <= 'F') {
6523
0
                ch += c - ('A' - 10);
6524
0
            }
6525
0
            else {
6526
0
                break;
6527
0
            }
6528
0
        }
6529
0
        if (!count) {
6530
0
            if (ch <= MAX_UNICODE) {
6531
0
                WRITE_CHAR(ch);
6532
0
                continue;
6533
0
            }
6534
0
            message = "\\Uxxxxxxxx out of range";
6535
0
        }
6536
6537
0
        endinpos = s-starts;
6538
0
        writer.min_length = end - s + writer.pos;
6539
0
        if (unicode_decode_call_errorhandler_writer(
6540
0
                errors, &errorHandler,
6541
0
                "rawunicodeescape", message,
6542
0
                &starts, &end, &startinpos, &endinpos, &exc, &s,
6543
0
                &writer)) {
6544
0
            goto onError;
6545
0
        }
6546
0
        assert(end - s <= writer.size - writer.pos);
6547
6548
0
#undef WRITE_CHAR
6549
0
    }
6550
0
    Py_XDECREF(errorHandler);
6551
0
    Py_XDECREF(exc);
6552
0
    return _PyUnicodeWriter_Finish(&writer);
6553
6554
0
  onError:
6555
0
    _PyUnicodeWriter_Dealloc(&writer);
6556
0
    Py_XDECREF(errorHandler);
6557
0
    Py_XDECREF(exc);
6558
0
    return NULL;
6559
6560
0
}
6561
6562
6563
/* Return a bytes object holding the Raw-Unicode-Escape form of `unicode`.

   A string of 1-byte kind is returned as a direct copy of its data.  For
   wider kinds, code points below 0x100 are copied as single bytes, code
   points below 0x10000 become \uHHHH and all others become \U00HHHHHH.

   Returns NULL when `unicode` is not a str object, when it cannot be made
   ready, or when allocating or resizing the output fails. */
PyObject *
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
{
    PyObject *bytes;
    char *out;
    Py_ssize_t per_char, index;
    Py_ssize_t length;
    int kind;
    void *data;
    int shift;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }

    kind = PyUnicode_KIND(unicode);
    data = PyUnicode_DATA(unicode);
    length = PyUnicode_GET_LENGTH(unicode);
    if (kind == PyUnicode_1BYTE_KIND) {
        /* Every character already fits in one byte; copy verbatim. */
        return PyBytes_FromStringAndSize(data, length);
    }

    /* Size the buffer for the longest escape each source character can
       need: 6 bytes per character for 2-byte strings and 10 bytes per
       character for 4-byte strings. */
    per_char = kind * 2 + 2;
    if (length > PY_SSIZE_T_MAX / per_char) {
        return PyErr_NoMemory();
    }
    bytes = PyBytes_FromStringAndSize(NULL, per_char * length);
    if (bytes == NULL) {
        return NULL;
    }
    if (length == 0) {
        return bytes;
    }

    out = PyBytes_AS_STRING(bytes);
    for (index = 0; index < length; index++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, index);

        if (ch >= 0x10000) {
            /* U+010000-U+10ffff: '\U00HHHHHH'. */
            assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
            *out++ = '\\';
            *out++ = 'U';
            *out++ = '0';
            *out++ = '0';
            for (shift = 20; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else if (ch >= 0x100) {
            /* U+0100-U+ffff: '\uHHHH'. */
            *out++ = '\\';
            *out++ = 'u';
            for (shift = 12; shift >= 0; shift -= 4) {
                *out++ = Py_hexdigits[(ch >> shift) & 0xf];
            }
        }
        else {
            /* U+0000-U+00ff: copied as a single byte. */
            *out++ = (char) ch;
        }
    }

    /* Shrink the buffer to the number of bytes actually written. */
    assert(out > PyBytes_AS_STRING(bytes));
    if (_PyBytes_Resize(&bytes, out - PyBytes_AS_STRING(bytes)) < 0) {
        return NULL;
    }
    return bytes;
}
6641
6642
PyObject *
6643
PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6644
                                 Py_ssize_t size)
6645
0
{
6646
0
    PyObject *result;
6647
0
    PyObject *tmp = PyUnicode_FromWideChar(s, size);
6648
0
    if (tmp == NULL)
6649
0
        return NULL;
6650
0
    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6651
0
    Py_DECREF(tmp);
6652
0
    return result;
6653
0
}
6654
6655
/* --- Latin-1 Codec ------------------------------------------------------ */
6656
6657
PyObject *
6658
PyUnicode_DecodeLatin1(const char *s,
6659
                       Py_ssize_t size,
6660
                       const char *errors)
6661
1
{
6662
    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6663
1
    return _PyUnicode_FromUCS1((const unsigned char*)s, size);
6664
1
}
6665
6666
/* create or adjust a UnicodeEncodeError */
6667
static void
6668
make_encode_exception(PyObject **exceptionObject,
6669
                      const char *encoding,
6670
                      PyObject *unicode,
6671
                      Py_ssize_t startpos, Py_ssize_t endpos,
6672
                      const char *reason)
6673
0
{
6674
0
    if (*exceptionObject == NULL) {
6675
0
        *exceptionObject = PyObject_CallFunction(
6676
0
            PyExc_UnicodeEncodeError, "sOnns",
6677
0
            encoding, unicode, startpos, endpos, reason);
6678
0
    }
6679
0
    else {
6680
0
        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6681
0
            goto onError;
6682
0
        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6683
0
            goto onError;
6684
0
        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6685
0
            goto onError;
6686
0
        return;
6687
0
      onError:
6688
0
        Py_CLEAR(*exceptionObject);
6689
0
    }
6690
0
}
6691
6692
/* raises a UnicodeEncodeError */
6693
static void
6694
raise_encode_exception(PyObject **exceptionObject,
6695
                       const char *encoding,
6696
                       PyObject *unicode,
6697
                       Py_ssize_t startpos, Py_ssize_t endpos,
6698
                       const char *reason)
6699
0
{
6700
0
    make_encode_exception(exceptionObject,
6701
0
                          encoding, unicode, startpos, endpos, reason);
6702
0
    if (*exceptionObject != NULL)
6703
0
        PyCodec_StrictErrors(*exceptionObject);
6704
0
}
6705
6706
/* error handling callback helper:
6707
   build arguments, call the callback and check the arguments,
6708
   put the result into newpos and return the replacement string, which
6709
   has to be freed by the caller */
6710
static PyObject *
6711
unicode_encode_call_errorhandler(const char *errors,
6712
                                 PyObject **errorHandler,
6713
                                 const char *encoding, const char *reason,
6714
                                 PyObject *unicode, PyObject **exceptionObject,
6715
                                 Py_ssize_t startpos, Py_ssize_t endpos,
6716
                                 Py_ssize_t *newpos)
6717
0
{
6718
0
    static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6719
0
    Py_ssize_t len;
6720
0
    PyObject *restuple;
6721
0
    PyObject *resunicode;
6722
6723
0
    if (*errorHandler == NULL) {
6724
0
        *errorHandler = PyCodec_LookupError(errors);
6725
0
        if (*errorHandler == NULL)
6726
0
            return NULL;
6727
0
    }
6728
6729
0
    if (PyUnicode_READY(unicode) == -1)
6730
0
        return NULL;
6731
0
    len = PyUnicode_GET_LENGTH(unicode);
6732
6733
0
    make_encode_exception(exceptionObject,
6734
0
                          encoding, unicode, startpos, endpos, reason);
6735
0
    if (*exceptionObject == NULL)
6736
0
        return NULL;
6737
6738
0
    restuple = PyObject_CallFunctionObjArgs(
6739
0
        *errorHandler, *exceptionObject, NULL);
6740
0
    if (restuple == NULL)
6741
0
        return NULL;
6742
0
    if (!PyTuple_Check(restuple)) {
6743
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6744
0
        Py_DECREF(restuple);
6745
0
        return NULL;
6746
0
    }
6747
0
    if (!PyArg_ParseTuple(restuple, argparse,
6748
0
                          &resunicode, newpos)) {
6749
0
        Py_DECREF(restuple);
6750
0
        return NULL;
6751
0
    }
6752
0
    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6753
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6754
0
        Py_DECREF(restuple);
6755
0
        return NULL;
6756
0
    }
6757
0
    if (*newpos<0)
6758
0
        *newpos = len + *newpos;
6759
0
    if (*newpos<0 || *newpos>len) {
6760
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6761
0
        Py_DECREF(restuple);
6762
0
        return NULL;
6763
0
    }
6764
0
    Py_INCREF(resunicode);
6765
0
    Py_DECREF(restuple);
6766
0
    return resunicode;
6767
0
}
6768
6769
static PyObject *
6770
unicode_encode_ucs1(PyObject *unicode,
6771
                    const char *errors,
6772
                    const Py_UCS4 limit)
6773
0
{
6774
    /* input state */
6775
0
    Py_ssize_t pos=0, size;
6776
0
    int kind;
6777
0
    void *data;
6778
    /* pointer into the output */
6779
0
    char *str;
6780
0
    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6781
0
    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6782
0
    PyObject *error_handler_obj = NULL;
6783
0
    PyObject *exc = NULL;
6784
0
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6785
0
    PyObject *rep = NULL;
6786
    /* output object */
6787
0
    _PyBytesWriter writer;
6788
6789
0
    if (PyUnicode_READY(unicode) == -1)
6790
0
        return NULL;
6791
0
    size = PyUnicode_GET_LENGTH(unicode);
6792
0
    kind = PyUnicode_KIND(unicode);
6793
0
    data = PyUnicode_DATA(unicode);
6794
    /* allocate enough for a simple encoding without
6795
       replacements, if we need more, we'll resize */
6796
0
    if (size == 0)
6797
0
        return PyBytes_FromStringAndSize(NULL, 0);
6798
6799
0
    _PyBytesWriter_Init(&writer);
6800
0
    str = _PyBytesWriter_Alloc(&writer, size);
6801
0
    if (str == NULL)
6802
0
        return NULL;
6803
6804
0
    while (pos < size) {
6805
0
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6806
6807
        /* can we encode this? */
6808
0
        if (ch < limit) {
6809
            /* no overflow check, because we know that the space is enough */
6810
0
            *str++ = (char)ch;
6811
0
            ++pos;
6812
0
        }
6813
0
        else {
6814
0
            Py_ssize_t newpos, i;
6815
            /* startpos for collecting unencodable chars */
6816
0
            Py_ssize_t collstart = pos;
6817
0
            Py_ssize_t collend = collstart + 1;
6818
            /* find all unecodable characters */
6819
6820
0
            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6821
0
                ++collend;
6822
6823
            /* Only overallocate the buffer if it's not the last write */
6824
0
            writer.overallocate = (collend < size);
6825
6826
            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6827
0
            if (error_handler == _Py_ERROR_UNKNOWN)
6828
0
                error_handler = _Py_GetErrorHandler(errors);
6829
6830
0
            switch (error_handler) {
6831
0
            case _Py_ERROR_STRICT:
6832
0
                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6833
0
                goto onError;
6834
6835
0
            case _Py_ERROR_REPLACE:
6836
0
                memset(str, '?', collend - collstart);
6837
0
                str += (collend - collstart);
6838
                /* fall through */
6839
0
            case _Py_ERROR_IGNORE:
6840
0
                pos = collend;
6841
0
                break;
6842
6843
0
            case _Py_ERROR_BACKSLASHREPLACE:
6844
                /* subtract preallocated bytes */
6845
0
                writer.min_size -= (collend - collstart);
6846
0
                str = backslashreplace(&writer, str,
6847
0
                                       unicode, collstart, collend);
6848
0
                if (str == NULL)
6849
0
                    goto onError;
6850
0
                pos = collend;
6851
0
                break;
6852
6853
0
            case _Py_ERROR_XMLCHARREFREPLACE:
6854
                /* subtract preallocated bytes */
6855
0
                writer.min_size -= (collend - collstart);
6856
0
                str = xmlcharrefreplace(&writer, str,
6857
0
                                        unicode, collstart, collend);
6858
0
                if (str == NULL)
6859
0
                    goto onError;
6860
0
                pos = collend;
6861
0
                break;
6862
6863
0
            case _Py_ERROR_SURROGATEESCAPE:
6864
0
                for (i = collstart; i < collend; ++i) {
6865
0
                    ch = PyUnicode_READ(kind, data, i);
6866
0
                    if (ch < 0xdc80 || 0xdcff < ch) {
6867
                        /* Not a UTF-8b surrogate */
6868
0
                        break;
6869
0
                    }
6870
0
                    *str++ = (char)(ch - 0xdc00);
6871
0
                    ++pos;
6872
0
                }
6873
0
                if (i >= collend)
6874
0
                    break;
6875
0
                collstart = pos;
6876
0
                assert(collstart != collend);
6877
                /* fall through */
6878
6879
0
            default:
6880
0
                rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6881
0
                                                       encoding, reason, unicode, &exc,
6882
0
                                                       collstart, collend, &newpos);
6883
0
                if (rep == NULL)
6884
0
                    goto onError;
6885
6886
                /* subtract preallocated bytes */
6887
0
                writer.min_size -= newpos - collstart;
6888
6889
0
                if (PyBytes_Check(rep)) {
6890
                    /* Directly copy bytes result to output. */
6891
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
6892
0
                                                    PyBytes_AS_STRING(rep),
6893
0
                                                    PyBytes_GET_SIZE(rep));
6894
0
                }
6895
0
                else {
6896
0
                    assert(PyUnicode_Check(rep));
6897
6898
0
                    if (PyUnicode_READY(rep) < 0)
6899
0
                        goto onError;
6900
6901
0
                    if (limit == 256 ?
6902
0
                        PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6903
0
                        !PyUnicode_IS_ASCII(rep))
6904
0
                    {
6905
                        /* Not all characters are smaller than limit */
6906
0
                        raise_encode_exception(&exc, encoding, unicode,
6907
0
                                               collstart, collend, reason);
6908
0
                        goto onError;
6909
0
                    }
6910
0
                    assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6911
0
                    str = _PyBytesWriter_WriteBytes(&writer, str,
6912
0
                                                    PyUnicode_DATA(rep),
6913
0
                                                    PyUnicode_GET_LENGTH(rep));
6914
0
                }
6915
0
                if (str == NULL)
6916
0
                    goto onError;
6917
6918
0
                pos = newpos;
6919
0
                Py_CLEAR(rep);
6920
0
            }
6921
6922
            /* If overallocation was disabled, ensure that it was the last
6923
               write. Otherwise, we missed an optimization */
6924
0
            assert(writer.overallocate || pos == size);
6925
0
        }
6926
0
    }
6927
6928
0
    Py_XDECREF(error_handler_obj);
6929
0
    Py_XDECREF(exc);
6930
0
    return _PyBytesWriter_Finish(&writer, str);
6931
6932
0
  onError:
6933
0
    Py_XDECREF(rep);
6934
0
    _PyBytesWriter_Dealloc(&writer);
6935
0
    Py_XDECREF(error_handler_obj);
6936
0
    Py_XDECREF(exc);
6937
0
    return NULL;
6938
0
}
6939
6940
/* Deprecated */
6941
PyObject *
6942
PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6943
                       Py_ssize_t size,
6944
                       const char *errors)
6945
0
{
6946
0
    PyObject *result;
6947
0
    PyObject *unicode = PyUnicode_FromWideChar(p, size);
6948
0
    if (unicode == NULL)
6949
0
        return NULL;
6950
0
    result = unicode_encode_ucs1(unicode, errors, 256);
6951
0
    Py_DECREF(unicode);
6952
0
    return result;
6953
0
}
6954
6955
/* Encode a str object as Latin-1 using the error handler named by `errors`.
   Returns a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }

    if (PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND) {
        /* Non-Latin-1 characters present: the generic encoder runs the
           error handler (or raises the exception). */
        return unicode_encode_ucs1(unicode, errors, 256);
    }

    /* Fast path: a one-byte string's storage can be copied directly into
       the bytes object. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
6973
6974
/* Public entry point: Latin-1 encode `unicode`, passing NULL as the
   error-handler name. */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    const char *errors = NULL;

    return _PyUnicode_AsLatin1String(unicode, errors);
}
6979
6980
/* --- 7-bit ASCII Codec -------------------------------------------------- */
6981
6982
PyObject *
6983
PyUnicode_DecodeASCII(const char *s,
6984
                      Py_ssize_t size,
6985
                      const char *errors)
6986
437
{
6987
437
    const char *starts = s;
6988
437
    _PyUnicodeWriter writer;
6989
437
    int kind;
6990
437
    void *data;
6991
437
    Py_ssize_t startinpos;
6992
437
    Py_ssize_t endinpos;
6993
437
    Py_ssize_t outpos;
6994
437
    const char *e;
6995
437
    PyObject *error_handler_obj = NULL;
6996
437
    PyObject *exc = NULL;
6997
437
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6998
6999
437
    if (size == 0)
7000
0
        _Py_RETURN_UNICODE_EMPTY();
7001
7002
    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
7003
437
    if (size == 1 && (unsigned char)s[0] < 128)
7004
0
        return get_latin1_char((unsigned char)s[0]);
7005
7006
437
    _PyUnicodeWriter_Init(&writer);
7007
437
    writer.min_length = size;
7008
437
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
7009
0
        return NULL;
7010
7011
437
    e = s + size;
7012
437
    data = writer.data;
7013
437
    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
7014
437
    writer.pos = outpos;
7015
437
    if (writer.pos == size)
7016
437
        return _PyUnicodeWriter_Finish(&writer);
7017
7018
0
    s += writer.pos;
7019
0
    kind = writer.kind;
7020
0
    while (s < e) {
7021
0
        unsigned char c = (unsigned char)*s;
7022
0
        if (c < 128) {
7023
0
            PyUnicode_WRITE(kind, data, writer.pos, c);
7024
0
            writer.pos++;
7025
0
            ++s;
7026
0
            continue;
7027
0
        }
7028
7029
        /* byte outsize range 0x00..0x7f: call the error handler */
7030
7031
0
        if (error_handler == _Py_ERROR_UNKNOWN)
7032
0
            error_handler = _Py_GetErrorHandler(errors);
7033
7034
0
        switch (error_handler)
7035
0
        {
7036
0
        case _Py_ERROR_REPLACE:
7037
0
        case _Py_ERROR_SURROGATEESCAPE:
7038
            /* Fast-path: the error handler only writes one character,
7039
               but we may switch to UCS2 at the first write */
7040
0
            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7041
0
                goto onError;
7042
0
            kind = writer.kind;
7043
0
            data = writer.data;
7044
7045
0
            if (error_handler == _Py_ERROR_REPLACE)
7046
0
                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7047
0
            else
7048
0
                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7049
0
            writer.pos++;
7050
0
            ++s;
7051
0
            break;
7052
7053
0
        case _Py_ERROR_IGNORE:
7054
0
            ++s;
7055
0
            break;
7056
7057
0
        default:
7058
0
            startinpos = s-starts;
7059
0
            endinpos = startinpos + 1;
7060
0
            if (unicode_decode_call_errorhandler_writer(
7061
0
                    errors, &error_handler_obj,
7062
0
                    "ascii", "ordinal not in range(128)",
7063
0
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7064
0
                    &writer))
7065
0
                goto onError;
7066
0
            kind = writer.kind;
7067
0
            data = writer.data;
7068
0
        }
7069
0
    }
7070
0
    Py_XDECREF(error_handler_obj);
7071
0
    Py_XDECREF(exc);
7072
0
    return _PyUnicodeWriter_Finish(&writer);
7073
7074
0
  onError:
7075
0
    _PyUnicodeWriter_Dealloc(&writer);
7076
0
    Py_XDECREF(error_handler_obj);
7077
0
    Py_XDECREF(exc);
7078
0
    return NULL;
7079
0
}
7080
7081
/* Deprecated */
7082
PyObject *
7083
PyUnicode_EncodeASCII(const Py_UNICODE *p,
7084
                      Py_ssize_t size,
7085
                      const char *errors)
7086
0
{
7087
0
    PyObject *result;
7088
0
    PyObject *unicode = PyUnicode_FromWideChar(p, size);
7089
0
    if (unicode == NULL)
7090
0
        return NULL;
7091
0
    result = unicode_encode_ucs1(unicode, errors, 128);
7092
0
    Py_DECREF(unicode);
7093
0
    return result;
7094
0
}
7095
7096
/* Encode a str object as ASCII using the error handler named by `errors`.
   Returns a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }

    if (!PyUnicode_IS_ASCII(unicode)) {
        /* Non-ASCII characters present: the generic encoder runs the error
           handler (or raises the exception). */
        return unicode_encode_ucs1(unicode, errors, 128);
    }

    /* Fast path: an ASCII-only string's storage can be copied directly
       into the bytes object. */
    return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
                                     PyUnicode_GET_LENGTH(unicode));
}
7112
7113
/* Public entry point: ASCII encode `unicode`, passing NULL as the
   error-handler name. */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    const char *errors = NULL;

    return _PyUnicode_AsASCIIString(unicode, errors);
}
7118
7119
#ifdef MS_WINDOWS
7120
7121
/* --- MBCS codecs for Windows -------------------------------------------- */
7122
7123
#if SIZEOF_INT < SIZEOF_SIZE_T
7124
#define NEED_RETRY
7125
#endif
7126
7127
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
7128
   transcoding from UTF-16), but INT_MAX / 4 performs better in
7129
   both cases also and avoids partial characters overrunning the
7130
   length limit in MultiByteToWideChar on Windows */
7131
#define DECODING_CHUNK_SIZE (INT_MAX/4)
7132
7133
#ifndef WC_ERR_INVALID_CHARS
7134
#  define WC_ERR_INVALID_CHARS 0x0080
7135
#endif
7136
7137
static const char*
7138
code_page_name(UINT code_page, PyObject **obj)
7139
{
7140
    *obj = NULL;
7141
    if (code_page == CP_ACP)
7142
        return "mbcs";
7143
    if (code_page == CP_UTF7)
7144
        return "CP_UTF7";
7145
    if (code_page == CP_UTF8)
7146
        return "CP_UTF8";
7147
7148
    *obj = PyBytes_FromFormat("cp%u", code_page);
7149
    if (*obj == NULL)
7150
        return NULL;
7151
    return PyBytes_AS_STRING(*obj);
7152
}
7153
7154
static DWORD
7155
decode_code_page_flags(UINT code_page)
7156
{
7157
    if (code_page == CP_UTF7) {
7158
        /* The CP_UTF7 decoder only supports flags=0 */
7159
        return 0;
7160
    }
7161
    else
7162
        return MB_ERR_INVALID_CHARS;
7163
}
7164
7165
/*
7166
 * Decode a byte string from a Windows code page into unicode object in strict
7167
 * mode.
7168
 *
7169
 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7170
 * OSError and returns -1 on other error.
7171
 */
7172
static int
7173
decode_code_page_strict(UINT code_page,
7174
                        wchar_t **buf,
7175
                        Py_ssize_t *bufsize,
7176
                        const char *in,
7177
                        int insize)
7178
{
7179
    DWORD flags = MB_ERR_INVALID_CHARS;
7180
    wchar_t *out;
7181
    DWORD outsize;
7182
7183
    /* First get the size of the result */
7184
    assert(insize > 0);
7185
    while ((outsize = MultiByteToWideChar(code_page, flags,
7186
                                          in, insize, NULL, 0)) <= 0)
7187
    {
7188
        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7189
            goto error;
7190
        }
7191
        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7192
        flags = 0;
7193
    }
7194
7195
    /* Extend a wchar_t* buffer */
7196
    Py_ssize_t n = *bufsize;   /* Get the current length */
7197
    if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7198
        return -1;
7199
    }
7200
    out = *buf + n;
7201
7202
    /* Do the conversion */
7203
    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7204
    if (outsize <= 0)
7205
        goto error;
7206
    return insize;
7207
7208
error:
7209
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7210
        return -2;
7211
    PyErr_SetFromWindowsErr(0);
7212
    return -1;
7213
}
7214
7215
/*
7216
 * Decode a byte string from a code page into unicode object with an error
7217
 * handler.
7218
 *
7219
 * Returns consumed size if succeed, or raise an OSError or
7220
 * UnicodeDecodeError exception and returns -1 on error.
7221
 */
7222
static int
7223
decode_code_page_errors(UINT code_page,
7224
                        wchar_t **buf,
7225
                        Py_ssize_t *bufsize,
7226
                        const char *in, const int size,
7227
                        const char *errors, int final)
7228
{
7229
    const char *startin = in;
7230
    const char *endin = in + size;
7231
    DWORD flags = MB_ERR_INVALID_CHARS;
7232
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7233
       2000 English version of the message. */
7234
    const char *reason = "No mapping for the Unicode character exists "
7235
                         "in the target code page.";
7236
    /* each step cannot decode more than 1 character, but a character can be
7237
       represented as a surrogate pair */
7238
    wchar_t buffer[2], *out;
7239
    int insize;
7240
    Py_ssize_t outsize;
7241
    PyObject *errorHandler = NULL;
7242
    PyObject *exc = NULL;
7243
    PyObject *encoding_obj = NULL;
7244
    const char *encoding;
7245
    DWORD err;
7246
    int ret = -1;
7247
7248
    assert(size > 0);
7249
7250
    encoding = code_page_name(code_page, &encoding_obj);
7251
    if (encoding == NULL)
7252
        return -1;
7253
7254
    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
7255
        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7256
           UnicodeDecodeError. */
7257
        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7258
        if (exc != NULL) {
7259
            PyCodec_StrictErrors(exc);
7260
            Py_CLEAR(exc);
7261
        }
7262
        goto error;
7263
    }
7264
7265
    /* Extend a wchar_t* buffer */
7266
    Py_ssize_t n = *bufsize;   /* Get the current length */
7267
    if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7268
        PyErr_NoMemory();
7269
        goto error;
7270
    }
7271
    if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7272
        goto error;
7273
    }
7274
    out = *buf + n;
7275
7276
    /* Decode the byte string character per character */
7277
    while (in < endin)
7278
    {
7279
        /* Decode a character */
7280
        insize = 1;
7281
        do
7282
        {
7283
            outsize = MultiByteToWideChar(code_page, flags,
7284
                                          in, insize,
7285
                                          buffer, Py_ARRAY_LENGTH(buffer));
7286
            if (outsize > 0)
7287
                break;
7288
            err = GetLastError();
7289
            if (err == ERROR_INVALID_FLAGS && flags) {
7290
                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7291
                flags = 0;
7292
                continue;
7293
            }
7294
            if (err != ERROR_NO_UNICODE_TRANSLATION
7295
                && err != ERROR_INSUFFICIENT_BUFFER)
7296
            {
7297
                PyErr_SetFromWindowsErr(0);
7298
                goto error;
7299
            }
7300
            insize++;
7301
        }
7302
        /* 4=maximum length of a UTF-8 sequence */
7303
        while (insize <= 4 && (in + insize) <= endin);
7304
7305
        if (outsize <= 0) {
7306
            Py_ssize_t startinpos, endinpos, outpos;
7307
7308
            /* last character in partial decode? */
7309
            if (in + insize >= endin && !final)
7310
                break;
7311
7312
            startinpos = in - startin;
7313
            endinpos = startinpos + 1;
7314
            outpos = out - *buf;
7315
            if (unicode_decode_call_errorhandler_wchar(
7316
                    errors, &errorHandler,
7317
                    encoding, reason,
7318
                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7319
                    buf, bufsize, &outpos))
7320
            {
7321
                goto error;
7322
            }
7323
            out = *buf + outpos;
7324
        }
7325
        else {
7326
            in += insize;
7327
            memcpy(out, buffer, outsize * sizeof(wchar_t));
7328
            out += outsize;
7329
        }
7330
    }
7331
7332
    /* Shrink the buffer */
7333
    assert(out - *buf <= *bufsize);
7334
    *bufsize = out - *buf;
7335
    /* (in - startin) <= size and size is an int */
7336
    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7337
7338
error:
7339
    Py_XDECREF(encoding_obj);
7340
    Py_XDECREF(errorHandler);
7341
    Py_XDECREF(exc);
7342
    return ret;
7343
}
7344
7345
static PyObject *
7346
decode_code_page_stateful(int code_page,
7347
                          const char *s, Py_ssize_t size,
7348
                          const char *errors, Py_ssize_t *consumed)
7349
{
7350
    wchar_t *buf = NULL;
7351
    Py_ssize_t bufsize = 0;
7352
    int chunk_size, final, converted, done;
7353
7354
    if (code_page < 0) {
7355
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7356
        return NULL;
7357
    }
7358
    if (size < 0) {
7359
        PyErr_BadInternalCall();
7360
        return NULL;
7361
    }
7362
7363
    if (consumed)
7364
        *consumed = 0;
7365
7366
    do
7367
    {
7368
#ifdef NEED_RETRY
7369
        if (size > DECODING_CHUNK_SIZE) {
7370
            chunk_size = DECODING_CHUNK_SIZE;
7371
            final = 0;
7372
            done = 0;
7373
        }
7374
        else
7375
#endif
7376
        {
7377
            chunk_size = (int)size;
7378
            final = (consumed == NULL);
7379
            done = 1;
7380
        }
7381
7382
        if (chunk_size == 0 && done) {
7383
            if (buf != NULL)
7384
                break;
7385
            _Py_RETURN_UNICODE_EMPTY();
7386
        }
7387
7388
        converted = decode_code_page_strict(code_page, &buf, &bufsize,
7389
                                            s, chunk_size);
7390
        if (converted == -2)
7391
            converted = decode_code_page_errors(code_page, &buf, &bufsize,
7392
                                                s, chunk_size,
7393
                                                errors, final);
7394
        assert(converted != 0 || done);
7395
7396
        if (converted < 0) {
7397
            PyMem_Free(buf);
7398
            return NULL;
7399
        }
7400
7401
        if (consumed)
7402
            *consumed += converted;
7403
7404
        s += converted;
7405
        size -= converted;
7406
    } while (!done);
7407
7408
    PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7409
    PyMem_Free(buf);
7410
    return v;
7411
}
7412
7413
PyObject *
7414
PyUnicode_DecodeCodePageStateful(int code_page,
7415
                                 const char *s,
7416
                                 Py_ssize_t size,
7417
                                 const char *errors,
7418
                                 Py_ssize_t *consumed)
7419
{
7420
    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7421
}
7422
7423
PyObject *
7424
PyUnicode_DecodeMBCSStateful(const char *s,
7425
                             Py_ssize_t size,
7426
                             const char *errors,
7427
                             Py_ssize_t *consumed)
7428
{
7429
    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7430
}
7431
7432
PyObject *
7433
PyUnicode_DecodeMBCS(const char *s,
7434
                     Py_ssize_t size,
7435
                     const char *errors)
7436
{
7437
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7438
}
7439
7440
static DWORD
7441
encode_code_page_flags(UINT code_page, const char *errors)
7442
{
7443
    if (code_page == CP_UTF8) {
7444
        return WC_ERR_INVALID_CHARS;
7445
    }
7446
    else if (code_page == CP_UTF7) {
7447
        /* CP_UTF7 only supports flags=0 */
7448
        return 0;
7449
    }
7450
    else {
7451
        if (errors != NULL && strcmp(errors, "replace") == 0)
7452
            return 0;
7453
        else
7454
            return WC_NO_BEST_FIT_CHARS;
7455
    }
7456
}
7457
7458
/*
7459
 * Encode a Unicode string to a Windows code page into a byte string in strict
7460
 * mode.
7461
 *
7462
 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7463
 * an OSError and returns -1 on other error.
7464
 */
7465
static int
7466
encode_code_page_strict(UINT code_page, PyObject **outbytes,
7467
                        PyObject *unicode, Py_ssize_t offset, int len,
7468
                        const char* errors)
7469
{
7470
    BOOL usedDefaultChar = FALSE;
7471
    BOOL *pusedDefaultChar = &usedDefaultChar;
7472
    int outsize;
7473
    wchar_t *p;
7474
    Py_ssize_t size;
7475
    const DWORD flags = encode_code_page_flags(code_page, NULL);
7476
    char *out;
7477
    /* Create a substring so that we can get the UTF-16 representation
7478
       of just the slice under consideration. */
7479
    PyObject *substring;
7480
7481
    assert(len > 0);
7482
7483
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7484
        pusedDefaultChar = &usedDefaultChar;
7485
    else
7486
        pusedDefaultChar = NULL;
7487
7488
    substring = PyUnicode_Substring(unicode, offset, offset+len);
7489
    if (substring == NULL)
7490
        return -1;
7491
    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7492
    if (p == NULL) {
7493
        Py_DECREF(substring);
7494
        return -1;
7495
    }
7496
    assert(size <= INT_MAX);
7497
7498
    /* First get the size of the result */
7499
    outsize = WideCharToMultiByte(code_page, flags,
7500
                                  p, (int)size,
7501
                                  NULL, 0,
7502
                                  NULL, pusedDefaultChar);
7503
    if (outsize <= 0)
7504
        goto error;
7505
    /* If we used a default char, then we failed! */
7506
    if (pusedDefaultChar && *pusedDefaultChar) {
7507
        Py_DECREF(substring);
7508
        return -2;
7509
    }
7510
7511
    if (*outbytes == NULL) {
7512
        /* Create string object */
7513
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7514
        if (*outbytes == NULL) {
7515
            Py_DECREF(substring);
7516
            return -1;
7517
        }
7518
        out = PyBytes_AS_STRING(*outbytes);
7519
    }
7520
    else {
7521
        /* Extend string object */
7522
        const Py_ssize_t n = PyBytes_Size(*outbytes);
7523
        if (outsize > PY_SSIZE_T_MAX - n) {
7524
            PyErr_NoMemory();
7525
            Py_DECREF(substring);
7526
            return -1;
7527
        }
7528
        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7529
            Py_DECREF(substring);
7530
            return -1;
7531
        }
7532
        out = PyBytes_AS_STRING(*outbytes) + n;
7533
    }
7534
7535
    /* Do the conversion */
7536
    outsize = WideCharToMultiByte(code_page, flags,
7537
                                  p, (int)size,
7538
                                  out, outsize,
7539
                                  NULL, pusedDefaultChar);
7540
    Py_CLEAR(substring);
7541
    if (outsize <= 0)
7542
        goto error;
7543
    if (pusedDefaultChar && *pusedDefaultChar)
7544
        return -2;
7545
    return 0;
7546
7547
error:
7548
    Py_XDECREF(substring);
7549
    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7550
        return -2;
7551
    PyErr_SetFromWindowsErr(0);
7552
    return -1;
7553
}
7554
7555
/*
7556
 * Encode a Unicode string to a Windows code page into a byte string using an
7557
 * error handler.
7558
 *
7559
 * Returns consumed characters if succeed, or raise an OSError and returns
7560
 * -1 on other error.
7561
 */
7562
static int
7563
encode_code_page_errors(UINT code_page, PyObject **outbytes,
7564
                        PyObject *unicode, Py_ssize_t unicode_offset,
7565
                        Py_ssize_t insize, const char* errors)
7566
{
7567
    const DWORD flags = encode_code_page_flags(code_page, errors);
7568
    Py_ssize_t pos = unicode_offset;
7569
    Py_ssize_t endin = unicode_offset + insize;
7570
    /* Ideally, we should get reason from FormatMessage. This is the Windows
7571
       2000 English version of the message. */
7572
    const char *reason = "invalid character";
7573
    /* 4=maximum length of a UTF-8 sequence */
7574
    char buffer[4];
7575
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7576
    Py_ssize_t outsize;
7577
    char *out;
7578
    PyObject *errorHandler = NULL;
7579
    PyObject *exc = NULL;
7580
    PyObject *encoding_obj = NULL;
7581
    const char *encoding;
7582
    Py_ssize_t newpos, newoutsize;
7583
    PyObject *rep;
7584
    int ret = -1;
7585
7586
    assert(insize > 0);
7587
7588
    encoding = code_page_name(code_page, &encoding_obj);
7589
    if (encoding == NULL)
7590
        return -1;
7591
7592
    if (errors == NULL || strcmp(errors, "strict") == 0) {
7593
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7594
           then we raise a UnicodeEncodeError. */
7595
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7596
        if (exc != NULL) {
7597
            PyCodec_StrictErrors(exc);
7598
            Py_DECREF(exc);
7599
        }
7600
        Py_XDECREF(encoding_obj);
7601
        return -1;
7602
    }
7603
7604
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7605
        pusedDefaultChar = &usedDefaultChar;
7606
    else
7607
        pusedDefaultChar = NULL;
7608
7609
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7610
        PyErr_NoMemory();
7611
        goto error;
7612
    }
7613
    outsize = insize * Py_ARRAY_LENGTH(buffer);
7614
7615
    if (*outbytes == NULL) {
7616
        /* Create string object */
7617
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7618
        if (*outbytes == NULL)
7619
            goto error;
7620
        out = PyBytes_AS_STRING(*outbytes);
7621
    }
7622
    else {
7623
        /* Extend string object */
7624
        Py_ssize_t n = PyBytes_Size(*outbytes);
7625
        if (n > PY_SSIZE_T_MAX - outsize) {
7626
            PyErr_NoMemory();
7627
            goto error;
7628
        }
7629
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7630
            goto error;
7631
        out = PyBytes_AS_STRING(*outbytes) + n;
7632
    }
7633
7634
    /* Encode the string character per character */
7635
    while (pos < endin)
7636
    {
7637
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7638
        wchar_t chars[2];
7639
        int charsize;
7640
        if (ch < 0x10000) {
7641
            chars[0] = (wchar_t)ch;
7642
            charsize = 1;
7643
        }
7644
        else {
7645
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7646
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7647
            charsize = 2;
7648
        }
7649
7650
        outsize = WideCharToMultiByte(code_page, flags,
7651
                                      chars, charsize,
7652
                                      buffer, Py_ARRAY_LENGTH(buffer),
7653
                                      NULL, pusedDefaultChar);
7654
        if (outsize > 0) {
7655
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7656
            {
7657
                pos++;
7658
                memcpy(out, buffer, outsize);
7659
                out += outsize;
7660
                continue;
7661
            }
7662
        }
7663
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7664
            PyErr_SetFromWindowsErr(0);
7665
            goto error;
7666
        }
7667
7668
        rep = unicode_encode_call_errorhandler(
7669
                  errors, &errorHandler, encoding, reason,
7670
                  unicode, &exc,
7671
                  pos, pos + 1, &newpos);
7672
        if (rep == NULL)
7673
            goto error;
7674
        pos = newpos;
7675
7676
        if (PyBytes_Check(rep)) {
7677
            outsize = PyBytes_GET_SIZE(rep);
7678
            if (outsize != 1) {
7679
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7680
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7681
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7682
                    Py_DECREF(rep);
7683
                    goto error;
7684
                }
7685
                out = PyBytes_AS_STRING(*outbytes) + offset;
7686
            }
7687
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7688
            out += outsize;
7689
        }
7690
        else {
7691
            Py_ssize_t i;
7692
            enum PyUnicode_Kind kind;
7693
            void *data;
7694
7695
            if (PyUnicode_READY(rep) == -1) {
7696
                Py_DECREF(rep);
7697
                goto error;
7698
            }
7699
7700
            outsize = PyUnicode_GET_LENGTH(rep);
7701
            if (outsize != 1) {
7702
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7703
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7704
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7705
                    Py_DECREF(rep);
7706
                    goto error;
7707
                }
7708
                out = PyBytes_AS_STRING(*outbytes) + offset;
7709
            }
7710
            kind = PyUnicode_KIND(rep);
7711
            data = PyUnicode_DATA(rep);
7712
            for (i=0; i < outsize; i++) {
7713
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7714
                if (ch > 127) {
7715
                    raise_encode_exception(&exc,
7716
                        encoding, unicode,
7717
                        pos, pos + 1,
7718
                        "unable to encode error handler result to ASCII");
7719
                    Py_DECREF(rep);
7720
                    goto error;
7721
                }
7722
                *out = (unsigned char)ch;
7723
                out++;
7724
            }
7725
        }
7726
        Py_DECREF(rep);
7727
    }
7728
    /* write a NUL byte */
7729
    *out = 0;
7730
    outsize = out - PyBytes_AS_STRING(*outbytes);
7731
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7732
    if (_PyBytes_Resize(outbytes, outsize) < 0)
7733
        goto error;
7734
    ret = 0;
7735
7736
error:
7737
    Py_XDECREF(encoding_obj);
7738
    Py_XDECREF(errorHandler);
7739
    Py_XDECREF(exc);
7740
    return ret;
7741
}
7742
7743
/* Encode `unicode` to the Windows code page `code_page`.
 *
 * The string is processed in chunks (only more than one when NEED_RETRY is
 * defined).  Each chunk is first tried with encode_code_page_strict(); if
 * that reports -2, the chunk is redone with encode_code_page_errors().
 *
 * Returns a new bytes object, or NULL with an exception set.
 */
static PyObject *
encode_code_page(int code_page,
                 PyObject *unicode,
                 const char *errors)
{
    PyObject *result = NULL;
    Py_ssize_t remaining;
    Py_ssize_t start = 0;
    int last_chunk;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }
    remaining = PyUnicode_GET_LENGTH(unicode);

    if (code_page < 0) {
        PyErr_SetString(PyExc_ValueError, "invalid code page number");
        return NULL;
    }

    if (remaining == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    do {
        int nchars;
        int status;

#ifdef NEED_RETRY
        last_chunk = (remaining <= DECODING_CHUNK_SIZE);
        nchars = last_chunk ? (int)remaining : DECODING_CHUNK_SIZE;
#else
        last_chunk = 1;
        nchars = (int)remaining;
#endif

        status = encode_code_page_strict(code_page, &result,
                                         unicode, start, nchars,
                                         errors);
        if (status == -2) {
            /* strict pass could not encode the chunk: use the handler */
            status = encode_code_page_errors(code_page, &result,
                                             unicode, start,
                                             nchars, errors);
        }
        if (status < 0) {
            Py_XDECREF(result);
            return NULL;
        }

        start += nchars;
        remaining -= nchars;
    } while (!last_chunk);

    return result;
}
7803
7804
PyObject *
7805
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7806
                     Py_ssize_t size,
7807
                     const char *errors)
7808
{
7809
    PyObject *unicode, *res;
7810
    unicode = PyUnicode_FromWideChar(p, size);
7811
    if (unicode == NULL)
7812
        return NULL;
7813
    res = encode_code_page(CP_ACP, unicode, errors);
7814
    Py_DECREF(unicode);
7815
    return res;
7816
}
7817
7818
/* Public entry point: encode `unicode` with the given Windows code page.
 * Delegates to encode_code_page(); returns its result unchanged.
 */
PyObject *
PyUnicode_EncodeCodePage(int code_page,
                         PyObject *unicode,
                         const char *errors)
{
    PyObject *encoded = encode_code_page(code_page, unicode, errors);
    return encoded;
}
7825
7826
/* Encode `unicode` with the CP_ACP code page, passing NULL as the error
 * handler name.
 */
PyObject *
PyUnicode_AsMBCSString(PyObject *unicode)
{
    const char *errors = NULL;
    return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
}
7831
7832
#undef NEED_RETRY
7833
7834
#endif /* MS_WINDOWS */
7835
7836
/* --- Character Mapping Codec -------------------------------------------- */
7837
7838
static int
7839
charmap_decode_string(const char *s,
7840
                      Py_ssize_t size,
7841
                      PyObject *mapping,
7842
                      const char *errors,
7843
                      _PyUnicodeWriter *writer)
7844
0
{
7845
0
    const char *starts = s;
7846
0
    const char *e;
7847
0
    Py_ssize_t startinpos, endinpos;
7848
0
    PyObject *errorHandler = NULL, *exc = NULL;
7849
0
    Py_ssize_t maplen;
7850
0
    enum PyUnicode_Kind mapkind;
7851
0
    void *mapdata;
7852
0
    Py_UCS4 x;
7853
0
    unsigned char ch;
7854
7855
0
    if (PyUnicode_READY(mapping) == -1)
7856
0
        return -1;
7857
7858
0
    maplen = PyUnicode_GET_LENGTH(mapping);
7859
0
    mapdata = PyUnicode_DATA(mapping);
7860
0
    mapkind = PyUnicode_KIND(mapping);
7861
7862
0
    e = s + size;
7863
7864
0
    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7865
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7866
         * is disabled in encoding aliases, latin1 is preferred because
7867
         * its implementation is faster. */
7868
0
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7869
0
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7870
0
        Py_UCS4 maxchar = writer->maxchar;
7871
7872
0
        assert (writer->kind == PyUnicode_1BYTE_KIND);
7873
0
        while (s < e) {
7874
0
            ch = *s;
7875
0
            x = mapdata_ucs1[ch];
7876
0
            if (x > maxchar) {
7877
0
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7878
0
                    goto onError;
7879
0
                maxchar = writer->maxchar;
7880
0
                outdata = (Py_UCS1 *)writer->data;
7881
0
            }
7882
0
            outdata[writer->pos] = x;
7883
0
            writer->pos++;
7884
0
            ++s;
7885
0
        }
7886
0
        return 0;
7887
0
    }
7888
7889
0
    while (s < e) {
7890
0
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7891
0
            enum PyUnicode_Kind outkind = writer->kind;
7892
0
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7893
0
            if (outkind == PyUnicode_1BYTE_KIND) {
7894
0
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7895
0
                Py_UCS4 maxchar = writer->maxchar;
7896
0
                while (s < e) {
7897
0
                    ch = *s;
7898
0
                    x = mapdata_ucs2[ch];
7899
0
                    if (x > maxchar)
7900
0
                        goto Error;
7901
0
                    outdata[writer->pos] = x;
7902
0
                    writer->pos++;
7903
0
                    ++s;
7904
0
                }
7905
0
                break;
7906
0
            }
7907
0
            else if (outkind == PyUnicode_2BYTE_KIND) {
7908
0
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7909
0
                while (s < e) {
7910
0
                    ch = *s;
7911
0
                    x = mapdata_ucs2[ch];
7912
0
                    if (x == 0xFFFE)
7913
0
                        goto Error;
7914
0
                    outdata[writer->pos] = x;
7915
0
                    writer->pos++;
7916
0
                    ++s;
7917
0
                }
7918
0
                break;
7919
0
            }
7920
0
        }
7921
0
        ch = *s;
7922
7923
0
        if (ch < maplen)
7924
0
            x = PyUnicode_READ(mapkind, mapdata, ch);
7925
0
        else
7926
0
            x = 0xfffe; /* invalid value */
7927
0
Error:
7928
0
        if (x == 0xfffe)
7929
0
        {
7930
            /* undefined mapping */
7931
0
            startinpos = s-starts;
7932
0
            endinpos = startinpos+1;
7933
0
            if (unicode_decode_call_errorhandler_writer(
7934
0
                    errors, &errorHandler,
7935
0
                    "charmap", "character maps to <undefined>",
7936
0
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7937
0
                    writer)) {
7938
0
                goto onError;
7939
0
            }
7940
0
            continue;
7941
0
        }
7942
7943
0
        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7944
0
            goto onError;
7945
0
        ++s;
7946
0
    }
7947
0
    Py_XDECREF(errorHandler);
7948
0
    Py_XDECREF(exc);
7949
0
    return 0;
7950
7951
0
onError:
7952
0
    Py_XDECREF(errorHandler);
7953
0
    Py_XDECREF(exc);
7954
0
    return -1;
7955
0
}
7956
7957
static int
7958
charmap_decode_mapping(const char *s,
7959
                       Py_ssize_t size,
7960
                       PyObject *mapping,
7961
                       const char *errors,
7962
                       _PyUnicodeWriter *writer)
7963
0
{
7964
0
    const char *starts = s;
7965
0
    const char *e;
7966
0
    Py_ssize_t startinpos, endinpos;
7967
0
    PyObject *errorHandler = NULL, *exc = NULL;
7968
0
    unsigned char ch;
7969
0
    PyObject *key, *item = NULL;
7970
7971
0
    e = s + size;
7972
7973
0
    while (s < e) {
7974
0
        ch = *s;
7975
7976
        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7977
0
        key = PyLong_FromLong((long)ch);
7978
0
        if (key == NULL)
7979
0
            goto onError;
7980
7981
0
        item = PyObject_GetItem(mapping, key);
7982
0
        Py_DECREF(key);
7983
0
        if (item == NULL) {
7984
0
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7985
                /* No mapping found means: mapping is undefined. */
7986
0
                PyErr_Clear();
7987
0
                goto Undefined;
7988
0
            } else
7989
0
                goto onError;
7990
0
        }
7991
7992
        /* Apply mapping */
7993
0
        if (item == Py_None)
7994
0
            goto Undefined;
7995
0
        if (PyLong_Check(item)) {
7996
0
            long value = PyLong_AS_LONG(item);
7997
0
            if (value == 0xFFFE)
7998
0
                goto Undefined;
7999
0
            if (value < 0 || value > MAX_UNICODE) {
8000
0
                PyErr_Format(PyExc_TypeError,
8001
0
                             "character mapping must be in range(0x%lx)",
8002
0
                             (unsigned long)MAX_UNICODE + 1);
8003
0
                goto onError;
8004
0
            }
8005
8006
0
            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8007
0
                goto onError;
8008
0
        }
8009
0
        else if (PyUnicode_Check(item)) {
8010
0
            if (PyUnicode_READY(item) == -1)
8011
0
                goto onError;
8012
0
            if (PyUnicode_GET_LENGTH(item) == 1) {
8013
0
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8014
0
                if (value == 0xFFFE)
8015
0
                    goto Undefined;
8016
0
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8017
0
                    goto onError;
8018
0
            }
8019
0
            else {
8020
0
                writer->overallocate = 1;
8021
0
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8022
0
                    goto onError;
8023
0
            }
8024
0
        }
8025
0
        else {
8026
            /* wrong return value */
8027
0
            PyErr_SetString(PyExc_TypeError,
8028
0
                            "character mapping must return integer, None or str");
8029
0
            goto onError;
8030
0
        }
8031
0
        Py_CLEAR(item);
8032
0
        ++s;
8033
0
        continue;
8034
8035
0
Undefined:
8036
        /* undefined mapping */
8037
0
        Py_CLEAR(item);
8038
0
        startinpos = s-starts;
8039
0
        endinpos = startinpos+1;
8040
0
        if (unicode_decode_call_errorhandler_writer(
8041
0
                errors, &errorHandler,
8042
0
                "charmap", "character maps to <undefined>",
8043
0
                &starts, &e, &startinpos, &endinpos, &exc, &s,
8044
0
                writer)) {
8045
0
            goto onError;
8046
0
        }
8047
0
    }
8048
0
    Py_XDECREF(errorHandler);
8049
0
    Py_XDECREF(exc);
8050
0
    return 0;
8051
8052
0
onError:
8053
0
    Py_XDECREF(item);
8054
0
    Py_XDECREF(errorHandler);
8055
0
    Py_XDECREF(exc);
8056
0
    return -1;
8057
0
}
8058
8059
PyObject *
8060
PyUnicode_DecodeCharmap(const char *s,
8061
                        Py_ssize_t size,
8062
                        PyObject *mapping,
8063
                        const char *errors)
8064
0
{
8065
0
    _PyUnicodeWriter writer;
8066
8067
    /* Default to Latin-1 */
8068
0
    if (mapping == NULL)
8069
0
        return PyUnicode_DecodeLatin1(s, size, errors);
8070
8071
0
    if (size == 0)
8072
0
        _Py_RETURN_UNICODE_EMPTY();
8073
0
    _PyUnicodeWriter_Init(&writer);
8074
0
    writer.min_length = size;
8075
0
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
8076
0
        goto onError;
8077
8078
0
    if (PyUnicode_CheckExact(mapping)) {
8079
0
        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8080
0
            goto onError;
8081
0
    }
8082
0
    else {
8083
0
        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8084
0
            goto onError;
8085
0
    }
8086
0
    return _PyUnicodeWriter_Finish(&writer);
8087
8088
0
  onError:
8089
0
    _PyUnicodeWriter_Dealloc(&writer);
8090
0
    return NULL;
8091
0
}
8092
8093
/* Charmap encoding: the lookup table */
8094
8095
struct encoding_map {
8096
    PyObject_HEAD
8097
    unsigned char level1[32];
8098
    int count2, count3;
8099
    unsigned char level23[1];
8100
};
8101
8102
/* EncodingMap.size(): return the size in bytes of the map object, i.e. the
 * struct itself plus its 16-byte level-2 and 128-byte level-3 blocks.
 */
static PyObject*
encoding_map_size(PyObject *obj, PyObject* args)
{
    struct encoding_map *self = (struct encoding_map*)obj;

    return PyLong_FromLong(sizeof(*self) - 1
                           + 16*self->count2
                           + 128*self->count3);
}
8109
8110
static PyMethodDef encoding_map_methods[] = {
8111
    {"size", encoding_map_size, METH_NOARGS,
8112
     PyDoc_STR("Return the size (in bytes) of this object") },
8113
    { 0 }
8114
};
8115
8116
static PyTypeObject EncodingMapType = {
8117
    PyVarObject_HEAD_INIT(NULL, 0)
8118
    "EncodingMap",          /*tp_name*/
8119
    sizeof(struct encoding_map),   /*tp_basicsize*/
8120
    0,                      /*tp_itemsize*/
8121
    /* methods */
8122
    0,                      /*tp_dealloc*/
8123
    0,                      /*tp_vectorcall_offset*/
8124
    0,                      /*tp_getattr*/
8125
    0,                      /*tp_setattr*/
8126
    0,                      /*tp_as_async*/
8127
    0,                      /*tp_repr*/
8128
    0,                      /*tp_as_number*/
8129
    0,                      /*tp_as_sequence*/
8130
    0,                      /*tp_as_mapping*/
8131
    0,                      /*tp_hash*/
8132
    0,                      /*tp_call*/
8133
    0,                      /*tp_str*/
8134
    0,                      /*tp_getattro*/
8135
    0,                      /*tp_setattro*/
8136
    0,                      /*tp_as_buffer*/
8137
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
8138
    0,                      /*tp_doc*/
8139
    0,                      /*tp_traverse*/
8140
    0,                      /*tp_clear*/
8141
    0,                      /*tp_richcompare*/
8142
    0,                      /*tp_weaklistoffset*/
8143
    0,                      /*tp_iter*/
8144
    0,                      /*tp_iternext*/
8145
    encoding_map_methods,   /*tp_methods*/
8146
    0,                      /*tp_members*/
8147
    0,                      /*tp_getset*/
8148
    0,                      /*tp_base*/
8149
    0,                      /*tp_dict*/
8150
    0,                      /*tp_descr_get*/
8151
    0,                      /*tp_descr_set*/
8152
    0,                      /*tp_dictoffset*/
8153
    0,                      /*tp_init*/
8154
    0,                      /*tp_alloc*/
8155
    0,                      /*tp_new*/
8156
    0,                      /*tp_free*/
8157
    0,                      /*tp_is_gc*/
8158
};
8159
8160
/* Build an encoding map from a decoding table.
 *
 * `string` is a non-empty str whose character at index i (for the first 256
 * indices) is the code point that byte value i decodes to; 0xFFFE marks an
 * unmapped byte.
 *
 * Returns a new reference to either:
 *   - an EncodingMap object (three-level trie), or
 *   - a dict {code point: byte value}, when the trie cannot be used:
 *     index 0 does not map to U+0000, some other index maps to U+0000,
 *     a non-BMP character is present, or too many trie blocks are needed.
 * Returns NULL with an exception set on error (bad argument, failure to
 * ready the string, or out of memory).
 */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;
    int kind;
    void *data;
    Py_ssize_t length;
    Py_UCS4 ch;

    if (!PyUnicode_Check(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    /* The length/kind/data accessors below require a ready string. */
    if (PyUnicode_READY(string) == -1)
        return NULL;
    if (!PyUnicode_GET_LENGTH(string)) {
        PyErr_BadArgument();
        return NULL;
    }
    kind = PyUnicode_KIND(string);
    data = PyUnicode_DATA(string);
    length = PyUnicode_GET_LENGTH(string);
    length = Py_MIN(length, 256);
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (PyUnicode_READ(kind, data, 0) != 0)
        need_dict = 1;
    /* First pass: count how many level-2 and level-3 blocks are needed. */
    for (i = 1; i < length; i++) {
        int l1, l2;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0 || ch > 0xFFFF) {
            need_dict = 1;
            break;
        }
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        l1 = ch >> 11;
        l2 = ch >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* 0xFF is reserved as the "unused" marker in the block tables. */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        PyObject *key = NULL, *value = NULL;

        result = PyDict_New();
        if (!result)
            return NULL;
        for (i = 0; i < length; i++) {
            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
            value = PyLong_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    memset(mlevel3, 0, 128*count3);
    /* Second pass: assign level-3 blocks in order of first use and store
       the byte value for each mapped character. */
    count3 = 0;
    for (i = 1; i < length; i++) {
        int o1, o2, o3, i2, i3;
        ch = PyUnicode_READ(kind, data, i);
        if (ch == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = ch>>11;
        o2 = (ch>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = ch & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
8269
8270
static int
8271
encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8272
0
{
8273
0
    struct encoding_map *map = (struct encoding_map*)mapping;
8274
0
    int l1 = c>>11;
8275
0
    int l2 = (c>>7) & 0xF;
8276
0
    int l3 = c & 0x7F;
8277
0
    int i;
8278
8279
0
    if (c > 0xFFFF)
8280
0
        return -1;
8281
0
    if (c == 0)
8282
0
        return 0;
8283
    /* level 1*/
8284
0
    i = map->level1[l1];
8285
0
    if (i == 0xFF) {
8286
0
        return -1;
8287
0
    }
8288
    /* level 2*/
8289
0
    i = map->level23[16*i+l2];
8290
0
    if (i == 0xFF) {
8291
0
        return -1;
8292
0
    }
8293
    /* level 3 */
8294
0
    i = map->level23[16*map->count2 + 128*i + l3];
8295
0
    if (i == 0) {
8296
0
        return -1;
8297
0
    }
8298
0
    return i;
8299
0
}
8300
8301
/* Lookup the character ch in the mapping. If the character
8302
   can't be found, Py_None is returned (or NULL, if another
8303
   error occurred). */
8304
static PyObject *
8305
charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8306
0
{
8307
0
    PyObject *w = PyLong_FromLong((long)c);
8308
0
    PyObject *x;
8309
8310
0
    if (w == NULL)
8311
0
        return NULL;
8312
0
    x = PyObject_GetItem(mapping, w);
8313
0
    Py_DECREF(w);
8314
0
    if (x == NULL) {
8315
0
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8316
            /* No mapping found means: mapping is undefined. */
8317
0
            PyErr_Clear();
8318
0
            Py_RETURN_NONE;
8319
0
        } else
8320
0
            return NULL;
8321
0
    }
8322
0
    else if (x == Py_None)
8323
0
        return x;
8324
0
    else if (PyLong_Check(x)) {
8325
0
        long value = PyLong_AS_LONG(x);
8326
0
        if (value < 0 || value > 255) {
8327
0
            PyErr_SetString(PyExc_TypeError,
8328
0
                            "character mapping must be in range(256)");
8329
0
            Py_DECREF(x);
8330
0
            return NULL;
8331
0
        }
8332
0
        return x;
8333
0
    }
8334
0
    else if (PyBytes_Check(x))
8335
0
        return x;
8336
0
    else {
8337
        /* wrong return value */
8338
0
        PyErr_Format(PyExc_TypeError,
8339
0
                     "character mapping must return integer, bytes or None, not %.400s",
8340
0
                     x->ob_type->tp_name);
8341
0
        Py_DECREF(x);
8342
0
        return NULL;
8343
0
    }
8344
0
}
8345
8346
static int
8347
charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8348
0
{
8349
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8350
    /* exponentially overallocate to minimize reallocations */
8351
0
    if (requiredsize < 2*outsize)
8352
0
        requiredsize = 2*outsize;
8353
0
    if (_PyBytes_Resize(outobj, requiredsize))
8354
0
        return -1;
8355
0
    return 0;
8356
0
}
8357
8358
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
8361
/* lookup the character, put the result in the output string and adjust
8362
   various state variables. Resize the output bytes object if not enough
8363
   space is available. Return a new reference to the object that
8364
   was put in the output buffer, or Py_None, if the mapping was undefined
8365
   (in which case no character was written) or NULL, if a
8366
   reallocation error occurred. The caller must decref the result */
8367
static charmapencode_result
8368
charmapencode_output(Py_UCS4 c, PyObject *mapping,
8369
                     PyObject **outobj, Py_ssize_t *outpos)
8370
0
{
8371
0
    PyObject *rep;
8372
0
    char *outstart;
8373
0
    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8374
8375
0
    if (Py_TYPE(mapping) == &EncodingMapType) {
8376
0
        int res = encoding_map_lookup(c, mapping);
8377
0
        Py_ssize_t requiredsize = *outpos+1;
8378
0
        if (res == -1)
8379
0
            return enc_FAILED;
8380
0
        if (outsize<requiredsize)
8381
0
            if (charmapencode_resize(outobj, outpos, requiredsize))
8382
0
                return enc_EXCEPTION;
8383
0
        outstart = PyBytes_AS_STRING(*outobj);
8384
0
        outstart[(*outpos)++] = (char)res;
8385
0
        return enc_SUCCESS;
8386
0
    }
8387
8388
0
    rep = charmapencode_lookup(c, mapping);
8389
0
    if (rep==NULL)
8390
0
        return enc_EXCEPTION;
8391
0
    else if (rep==Py_None) {
8392
0
        Py_DECREF(rep);
8393
0
        return enc_FAILED;
8394
0
    } else {
8395
0
        if (PyLong_Check(rep)) {
8396
0
            Py_ssize_t requiredsize = *outpos+1;
8397
0
            if (outsize<requiredsize)
8398
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8399
0
                    Py_DECREF(rep);
8400
0
                    return enc_EXCEPTION;
8401
0
                }
8402
0
            outstart = PyBytes_AS_STRING(*outobj);
8403
0
            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8404
0
        }
8405
0
        else {
8406
0
            const char *repchars = PyBytes_AS_STRING(rep);
8407
0
            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8408
0
            Py_ssize_t requiredsize = *outpos+repsize;
8409
0
            if (outsize<requiredsize)
8410
0
                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8411
0
                    Py_DECREF(rep);
8412
0
                    return enc_EXCEPTION;
8413
0
                }
8414
0
            outstart = PyBytes_AS_STRING(*outobj);
8415
0
            memcpy(outstart + *outpos, repchars, repsize);
8416
0
            *outpos += repsize;
8417
0
        }
8418
0
    }
8419
0
    Py_DECREF(rep);
8420
0
    return enc_SUCCESS;
8421
0
}
8422
8423
/* handle an error in PyUnicode_EncodeCharmap
8424
   Return 0 on success, -1 on error */
8425
static int
8426
charmap_encoding_error(
8427
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8428
    PyObject **exceptionObject,
8429
    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8430
    PyObject **res, Py_ssize_t *respos)
8431
0
{
8432
0
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8433
0
    Py_ssize_t size, repsize;
8434
0
    Py_ssize_t newpos;
8435
0
    enum PyUnicode_Kind kind;
8436
0
    void *data;
8437
0
    Py_ssize_t index;
8438
    /* startpos for collecting unencodable chars */
8439
0
    Py_ssize_t collstartpos = *inpos;
8440
0
    Py_ssize_t collendpos = *inpos+1;
8441
0
    Py_ssize_t collpos;
8442
0
    const char *encoding = "charmap";
8443
0
    const char *reason = "character maps to <undefined>";
8444
0
    charmapencode_result x;
8445
0
    Py_UCS4 ch;
8446
0
    int val;
8447
8448
0
    if (PyUnicode_READY(unicode) == -1)
8449
0
        return -1;
8450
0
    size = PyUnicode_GET_LENGTH(unicode);
8451
    /* find all unencodable characters */
8452
0
    while (collendpos < size) {
8453
0
        PyObject *rep;
8454
0
        if (Py_TYPE(mapping) == &EncodingMapType) {
8455
0
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8456
0
            val = encoding_map_lookup(ch, mapping);
8457
0
            if (val != -1)
8458
0
                break;
8459
0
            ++collendpos;
8460
0
            continue;
8461
0
        }
8462
8463
0
        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8464
0
        rep = charmapencode_lookup(ch, mapping);
8465
0
        if (rep==NULL)
8466
0
            return -1;
8467
0
        else if (rep!=Py_None) {
8468
0
            Py_DECREF(rep);
8469
0
            break;
8470
0
        }
8471
0
        Py_DECREF(rep);
8472
0
        ++collendpos;
8473
0
    }
8474
    /* cache callback name lookup
8475
     * (if not done yet, i.e. it's the first error) */
8476
0
    if (*error_handler == _Py_ERROR_UNKNOWN)
8477
0
        *error_handler = _Py_GetErrorHandler(errors);
8478
8479
0
    switch (*error_handler) {
8480
0
    case _Py_ERROR_STRICT:
8481
0
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8482
0
        return -1;
8483
8484
0
    case _Py_ERROR_REPLACE:
8485
0
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8486
0
            x = charmapencode_output('?', mapping, res, respos);
8487
0
            if (x==enc_EXCEPTION) {
8488
0
                return -1;
8489
0
            }
8490
0
            else if (x==enc_FAILED) {
8491
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8492
0
                return -1;
8493
0
            }
8494
0
        }
8495
        /* fall through */
8496
0
    case _Py_ERROR_IGNORE:
8497
0
        *inpos = collendpos;
8498
0
        break;
8499
8500
0
    case _Py_ERROR_XMLCHARREFREPLACE:
8501
        /* generate replacement (temporarily (mis)uses p) */
8502
0
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8503
0
            char buffer[2+29+1+1];
8504
0
            char *cp;
8505
0
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8506
0
            for (cp = buffer; *cp; ++cp) {
8507
0
                x = charmapencode_output(*cp, mapping, res, respos);
8508
0
                if (x==enc_EXCEPTION)
8509
0
                    return -1;
8510
0
                else if (x==enc_FAILED) {
8511
0
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8512
0
                    return -1;
8513
0
                }
8514
0
            }
8515
0
        }
8516
0
        *inpos = collendpos;
8517
0
        break;
8518
8519
0
    default:
8520
0
        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8521
0
                                                      encoding, reason, unicode, exceptionObject,
8522
0
                                                      collstartpos, collendpos, &newpos);
8523
0
        if (repunicode == NULL)
8524
0
            return -1;
8525
0
        if (PyBytes_Check(repunicode)) {
8526
            /* Directly copy bytes result to output. */
8527
0
            Py_ssize_t outsize = PyBytes_Size(*res);
8528
0
            Py_ssize_t requiredsize;
8529
0
            repsize = PyBytes_Size(repunicode);
8530
0
            requiredsize = *respos + repsize;
8531
0
            if (requiredsize > outsize)
8532
                /* Make room for all additional bytes. */
8533
0
                if (charmapencode_resize(res, respos, requiredsize)) {
8534
0
                    Py_DECREF(repunicode);
8535
0
                    return -1;
8536
0
                }
8537
0
            memcpy(PyBytes_AsString(*res) + *respos,
8538
0
                   PyBytes_AsString(repunicode),  repsize);
8539
0
            *respos += repsize;
8540
0
            *inpos = newpos;
8541
0
            Py_DECREF(repunicode);
8542
0
            break;
8543
0
        }
8544
        /* generate replacement  */
8545
0
        if (PyUnicode_READY(repunicode) == -1) {
8546
0
            Py_DECREF(repunicode);
8547
0
            return -1;
8548
0
        }
8549
0
        repsize = PyUnicode_GET_LENGTH(repunicode);
8550
0
        data = PyUnicode_DATA(repunicode);
8551
0
        kind = PyUnicode_KIND(repunicode);
8552
0
        for (index = 0; index < repsize; index++) {
8553
0
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8554
0
            x = charmapencode_output(repch, mapping, res, respos);
8555
0
            if (x==enc_EXCEPTION) {
8556
0
                Py_DECREF(repunicode);
8557
0
                return -1;
8558
0
            }
8559
0
            else if (x==enc_FAILED) {
8560
0
                Py_DECREF(repunicode);
8561
0
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8562
0
                return -1;
8563
0
            }
8564
0
        }
8565
0
        *inpos = newpos;
8566
0
        Py_DECREF(repunicode);
8567
0
    }
8568
0
    return 0;
8569
0
}
8570
8571
/* Encode `unicode` to a bytes object through `mapping`.

   A NULL mapping selects Latin-1 (via unicode_encode_ucs1()).  `errors`
   names the error handler applied to unencodable characters.

   Return a new bytes object, or NULL with an exception set. */
PyObject *
_PyUnicode_EncodeCharmap(PyObject *unicode,
                         PyObject *mapping,
                         const char *errors)
{
    PyObject *res = NULL;                   /* output bytes object */
    PyObject *error_handler_obj = NULL;
    PyObject *exc = NULL;
    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
    Py_ssize_t inpos = 0;                   /* read position in unicode */
    Py_ssize_t respos = 0;                  /* write position in res */
    Py_ssize_t size;
    void *data;
    int kind;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    size = PyUnicode_GET_LENGTH(unicode);
    data = PyUnicode_DATA(unicode);
    kind = PyUnicode_KIND(unicode);

    /* Default to Latin-1 */
    if (mapping == NULL)
        return unicode_encode_ucs1(unicode, errors, 256);

    /* Start with one output byte per input character; the output helpers
       grow the buffer when a replacement needs more room. */
    res = PyBytes_FromStringAndSize(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;

    while (inpos < size) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);

        switch (charmapencode_output(ch, mapping, &res, &respos)) {
        case enc_EXCEPTION:
            goto onError;

        case enc_FAILED:
            /* unencodable character: the error handler moves inpos */
            if (charmap_encoding_error(unicode, &inpos, mapping,
                                       &exc,
                                       &error_handler, &error_handler_obj, errors,
                                       &res, &respos)) {
                goto onError;
            }
            break;

        default:
            /* character written => go on with the next one */
            ++inpos;
            break;
        }
    }

    /* Shrink the result if we allocated too much */
    if (respos < PyBytes_GET_SIZE(res) && _PyBytes_Resize(&res, respos) < 0)
        goto onError;

    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(error_handler_obj);
    return NULL;
}
8641
8642
/* Deprecated */
8643
PyObject *
8644
PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8645
                        Py_ssize_t size,
8646
                        PyObject *mapping,
8647
                        const char *errors)
8648
0
{
8649
0
    PyObject *result;
8650
0
    PyObject *unicode = PyUnicode_FromWideChar(p, size);
8651
0
    if (unicode == NULL)
8652
0
        return NULL;
8653
0
    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8654
0
    Py_DECREF(unicode);
8655
0
    return result;
8656
0
}
8657
8658
/* Encode the str object `unicode` through `mapping` with no explicit error
   handler (errors == NULL).

   Both arguments are mandatory: a non-str `unicode` or a NULL `mapping` is
   reported with PyErr_BadArgument() and NULL is returned. */
PyObject *
PyUnicode_AsCharmapString(PyObject *unicode,
                          PyObject *mapping)
{
    if (PyUnicode_Check(unicode) && mapping != NULL)
        return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);

    PyErr_BadArgument();
    return NULL;
}
8668
8669
/* create or adjust a UnicodeTranslateError */
8670
static void
8671
make_translate_exception(PyObject **exceptionObject,
8672
                         PyObject *unicode,
8673
                         Py_ssize_t startpos, Py_ssize_t endpos,
8674
                         const char *reason)
8675
0
{
8676
0
    if (*exceptionObject == NULL) {
8677
0
        *exceptionObject = _PyUnicodeTranslateError_Create(
8678
0
            unicode, startpos, endpos, reason);
8679
0
    }
8680
0
    else {
8681
0
        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8682
0
            goto onError;
8683
0
        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8684
0
            goto onError;
8685
0
        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8686
0
            goto onError;
8687
0
        return;
8688
0
      onError:
8689
0
        Py_CLEAR(*exceptionObject);
8690
0
    }
8691
0
}
8692
8693
/* error handling callback helper:
8694
   build arguments, call the callback and check the arguments,
8695
   put the result into newpos and return the replacement string, which
8696
   has to be freed by the caller */
8697
static PyObject *
8698
unicode_translate_call_errorhandler(const char *errors,
8699
                                    PyObject **errorHandler,
8700
                                    const char *reason,
8701
                                    PyObject *unicode, PyObject **exceptionObject,
8702
                                    Py_ssize_t startpos, Py_ssize_t endpos,
8703
                                    Py_ssize_t *newpos)
8704
0
{
8705
0
    static const char *argparse = "Un;translating error handler must return (str, int) tuple";
8706
8707
0
    Py_ssize_t i_newpos;
8708
0
    PyObject *restuple;
8709
0
    PyObject *resunicode;
8710
8711
0
    if (*errorHandler == NULL) {
8712
0
        *errorHandler = PyCodec_LookupError(errors);
8713
0
        if (*errorHandler == NULL)
8714
0
            return NULL;
8715
0
    }
8716
8717
0
    make_translate_exception(exceptionObject,
8718
0
                             unicode, startpos, endpos, reason);
8719
0
    if (*exceptionObject == NULL)
8720
0
        return NULL;
8721
8722
0
    restuple = PyObject_CallFunctionObjArgs(
8723
0
        *errorHandler, *exceptionObject, NULL);
8724
0
    if (restuple == NULL)
8725
0
        return NULL;
8726
0
    if (!PyTuple_Check(restuple)) {
8727
0
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
8728
0
        Py_DECREF(restuple);
8729
0
        return NULL;
8730
0
    }
8731
0
    if (!PyArg_ParseTuple(restuple, argparse,
8732
0
                          &resunicode, &i_newpos)) {
8733
0
        Py_DECREF(restuple);
8734
0
        return NULL;
8735
0
    }
8736
0
    if (i_newpos<0)
8737
0
        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8738
0
    else
8739
0
        *newpos = i_newpos;
8740
0
    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8741
0
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8742
0
        Py_DECREF(restuple);
8743
0
        return NULL;
8744
0
    }
8745
0
    Py_INCREF(resunicode);
8746
0
    Py_DECREF(restuple);
8747
0
    return resunicode;
8748
0
}
8749
8750
/* Lookup the character ch in the mapping and put the result in result,
8751
   which must be decrefed by the caller.
8752
   Return 0 on success, -1 on error */
8753
static int
8754
charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8755
95
{
8756
95
    PyObject *w = PyLong_FromLong((long)c);
8757
95
    PyObject *x;
8758
8759
95
    if (w == NULL)
8760
0
        return -1;
8761
95
    x = PyObject_GetItem(mapping, w);
8762
95
    Py_DECREF(w);
8763
95
    if (x == NULL) {
8764
41
        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8765
            /* No mapping found means: use 1:1 mapping. */
8766
41
            PyErr_Clear();
8767
41
            *result = NULL;
8768
41
            return 0;
8769
41
        } else
8770
0
            return -1;
8771
41
    }
8772
54
    else if (x == Py_None) {
8773
0
        *result = x;
8774
0
        return 0;
8775
0
    }
8776
54
    else if (PyLong_Check(x)) {
8777
0
        long value = PyLong_AS_LONG(x);
8778
0
        if (value < 0 || value > MAX_UNICODE) {
8779
0
            PyErr_Format(PyExc_ValueError,
8780
0
                         "character mapping must be in range(0x%x)",
8781
0
                         MAX_UNICODE+1);
8782
0
            Py_DECREF(x);
8783
0
            return -1;
8784
0
        }
8785
0
        *result = x;
8786
0
        return 0;
8787
0
    }
8788
54
    else if (PyUnicode_Check(x)) {
8789
54
        *result = x;
8790
54
        return 0;
8791
54
    }
8792
0
    else {
8793
        /* wrong return value */
8794
0
        PyErr_SetString(PyExc_TypeError,
8795
0
                        "character mapping must return integer, None or str");
8796
0
        Py_DECREF(x);
8797
0
        return -1;
8798
0
    }
8799
95
}
8800
8801
/* lookup the character, write the result into the writer.
8802
   Return 1 if the result was written into the writer, return 0 if the mapping
8803
   was undefined, raise an exception return -1 on error. */
8804
static int
8805
charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8806
                        _PyUnicodeWriter *writer)
8807
37
{
8808
37
    PyObject *item;
8809
8810
37
    if (charmaptranslate_lookup(ch, mapping, &item))
8811
0
        return -1;
8812
8813
37
    if (item == NULL) {
8814
        /* not found => default to 1:1 mapping */
8815
8
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8816
0
            return -1;
8817
0
        }
8818
8
        return 1;
8819
8
    }
8820
8821
29
    if (item == Py_None) {
8822
0
        Py_DECREF(item);
8823
0
        return 0;
8824
0
    }
8825
8826
29
    if (PyLong_Check(item)) {
8827
0
        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8828
        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8829
           used it */
8830
0
        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8831
0
            Py_DECREF(item);
8832
0
            return -1;
8833
0
        }
8834
0
        Py_DECREF(item);
8835
0
        return 1;
8836
0
    }
8837
8838
29
    if (!PyUnicode_Check(item)) {
8839
0
        Py_DECREF(item);
8840
0
        return -1;
8841
0
    }
8842
8843
29
    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8844
0
        Py_DECREF(item);
8845
0
        return -1;
8846
0
    }
8847
8848
29
    Py_DECREF(item);
8849
29
    return 1;
8850
29
}
8851
8852
static int
8853
unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8854
                              Py_UCS1 *translate)
8855
58
{
8856
58
    PyObject *item = NULL;
8857
58
    int ret = 0;
8858
8859
58
    if (charmaptranslate_lookup(ch, mapping, &item)) {
8860
0
        return -1;
8861
0
    }
8862
8863
58
    if (item == Py_None) {
8864
        /* deletion */
8865
0
        translate[ch] = 0xfe;
8866
0
    }
8867
58
    else if (item == NULL) {
8868
        /* not found => default to 1:1 mapping */
8869
33
        translate[ch] = ch;
8870
33
        return 1;
8871
33
    }
8872
25
    else if (PyLong_Check(item)) {
8873
0
        long replace = PyLong_AS_LONG(item);
8874
        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8875
           used it */
8876
0
        if (127 < replace) {
8877
            /* invalid character or character outside ASCII:
8878
               skip the fast translate */
8879
0
            goto exit;
8880
0
        }
8881
0
        translate[ch] = (Py_UCS1)replace;
8882
0
    }
8883
25
    else if (PyUnicode_Check(item)) {
8884
25
        Py_UCS4 replace;
8885
8886
25
        if (PyUnicode_READY(item) == -1) {
8887
0
            Py_DECREF(item);
8888
0
            return -1;
8889
0
        }
8890
25
        if (PyUnicode_GET_LENGTH(item) != 1)
8891
25
            goto exit;
8892
8893
0
        replace = PyUnicode_READ_CHAR(item, 0);
8894
0
        if (replace > 127)
8895
0
            goto exit;
8896
0
        translate[ch] = (Py_UCS1)replace;
8897
0
    }
8898
0
    else {
8899
        /* not None, NULL, long or unicode */
8900
0
        goto exit;
8901
0
    }
8902
0
    ret = 1;
8903
8904
25
  exit:
8905
25
    Py_DECREF(item);
8906
25
    return ret;
8907
0
}
8908
8909
/* Fast path for ascii => ascii translation. Return 1 if the whole string
8910
   was translated into writer, return 0 if the input string was partially
8911
   translated into writer, raise an exception and return -1 on error. */
8912
static int
8913
unicode_fast_translate(PyObject *input, PyObject *mapping,
8914
                       _PyUnicodeWriter *writer, int ignore,
8915
                       Py_ssize_t *input_pos)
8916
48
{
8917
48
    Py_UCS1 ascii_table[128], ch, ch2;
8918
48
    Py_ssize_t len;
8919
48
    Py_UCS1 *in, *end, *out;
8920
48
    int res = 0;
8921
8922
48
    len = PyUnicode_GET_LENGTH(input);
8923
8924
48
    memset(ascii_table, 0xff, 128);
8925
8926
48
    in = PyUnicode_1BYTE_DATA(input);
8927
48
    end = in + len;
8928
8929
48
    assert(PyUnicode_IS_ASCII(writer->buffer));
8930
48
    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8931
48
    out = PyUnicode_1BYTE_DATA(writer->buffer);
8932
8933
88
    for (; in < end; in++) {
8934
65
        ch = *in;
8935
65
        ch2 = ascii_table[ch];
8936
65
        if (ch2 == 0xff) {
8937
58
            int translate = unicode_fast_translate_lookup(mapping, ch,
8938
58
                                                          ascii_table);
8939
58
            if (translate < 0)
8940
0
                return -1;
8941
58
            if (translate == 0)
8942
25
                goto exit;
8943
33
            ch2 = ascii_table[ch];
8944
33
        }
8945
40
        if (ch2 == 0xfe) {
8946
0
            if (ignore)
8947
0
                continue;
8948
0
            goto exit;
8949
0
        }
8950
40
        assert(ch2 < 128);
8951
40
        *out = ch2;
8952
40
        out++;
8953
40
    }
8954
23
    res = 1;
8955
8956
48
exit:
8957
48
    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8958
48
    *input_pos = in - PyUnicode_1BYTE_DATA(input);
8959
48
    return res;
8960
23
}
8961
8962
/* Translate the str object `input` through `mapping`.

   ASCII input first goes through unicode_fast_translate(); whatever that
   could not handle is translated one character at a time.  Runs of
   characters that map to None are either skipped (errors == "ignore") or
   handed to the error handler named by `errors`.

   Return a new str object, or NULL with an exception set.  A NULL mapping
   is reported with PyErr_BadArgument(). */
static PyObject *
_PyUnicode_TranslateCharmap(PyObject *input,
                            PyObject *mapping,
                            const char *errors)
{
    /* input object */
    char *data;
    Py_ssize_t size, i;
    int kind;
    /* output buffer */
    _PyUnicodeWriter writer;
    /* error handler */
    const char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    int ignore;
    int res;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    if (PyUnicode_READY(input) == -1)
        return NULL;
    data = (char*)PyUnicode_DATA(input);
    kind = PyUnicode_KIND(input);
    size = PyUnicode_GET_LENGTH(input);

    if (size == 0)
        return PyUnicode_FromObject(input);

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    _PyUnicodeWriter_Init(&writer);
    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
        goto onError;

    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);

    /* input was already made ready above; no second PyUnicode_READY()
       check here, its failure path would have returned without releasing
       the writer. */
    if (PyUnicode_IS_ASCII(input)) {
        res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
        if (res < 0)
            goto onError;
        if (res == 1)
            return _PyUnicodeWriter_Finish(&writer);
        /* res == 0: continue below at index i, where the fast path
           stopped */
    }
    else {
        i = 0;
    }

    while (i<size) {
        /* try to translate it */
        int translate;
        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
        Py_ssize_t newpos;
        /* startpos for collecting untranslatable chars */
        Py_ssize_t collstart;
        Py_ssize_t collend;
        Py_UCS4 ch;

        ch = PyUnicode_READ(kind, data, i);
        translate = charmaptranslate_output(ch, mapping, &writer);
        if (translate < 0)
            goto onError;

        if (translate != 0) {
            /* it worked => adjust input pointer */
            ++i;
            continue;
        }

        /* untranslatable character */
        collstart = i;
        collend = i+1;

        /* find all untranslatable characters */
        while (collend < size) {
            PyObject *x;
            int undefined;

            ch = PyUnicode_READ(kind, data, collend);
            if (charmaptranslate_lookup(ch, mapping, &x))
                goto onError;
            /* only the identity of x matters: test it before releasing */
            undefined = (x == Py_None);
            Py_XDECREF(x);
            if (!undefined)
                break;
            ++collend;
        }

        if (ignore) {
            i = collend;
        }
        else {
            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                             reason, input, &exc,
                                                             collstart, collend, &newpos);
            if (repunicode == NULL)
                goto onError;
            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
                Py_DECREF(repunicode);
                goto onError;
            }
            Py_DECREF(repunicode);
            /* continue where the error handler asked us to */
            i = newpos;
        }
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
9081
9082
/* Deprecated. Use PyUnicode_Translate instead. */
9083
PyObject *
9084
PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9085
                           Py_ssize_t size,
9086
                           PyObject *mapping,
9087
                           const char *errors)
9088
0
{
9089
0
    PyObject *result;
9090
0
    PyObject *unicode = PyUnicode_FromWideChar(p, size);
9091
0
    if (!unicode)
9092
0
        return NULL;
9093
0
    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9094
0
    Py_DECREF(unicode);
9095
0
    return result;
9096
0
}
9097
9098
/* Translate `str` through `mapping` using the error handler named by
   `errors`.  `str` is validated with ensure_unicode() first.
   Return the translated object, or NULL on error. */
PyObject *
PyUnicode_Translate(PyObject *str,
                    PyObject *mapping,
                    const char *errors)
{
    if (ensure_unicode(str) >= 0)
        return _PyUnicode_TranslateCharmap(str, mapping, errors);
    return NULL;
}
9107
9108
/* Return an ASCII version of `unicode` in which code points below 127 are
   kept, other whitespace characters become ' ' and other decimal digits
   become the matching ASCII digit.

   At the first character that is none of these, '?' is stored and the
   result is cut right after it.  An input that is already ASCII is
   returned as is (new reference).

   Return NULL with an exception set on error; a non-str argument raises
   via PyErr_BadInternalCall(). */
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
    if (PyUnicode_IS_ASCII(unicode)) {
        /* nothing to transform: hand back the same string */
        Py_INCREF(unicode);
        return unicode;
    }

    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }

    const void *data = PyUnicode_DATA(unicode);
    int kind = PyUnicode_KIND(unicode);
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    Py_ssize_t pos;

    for (pos = 0; pos < len; ++pos) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        int digit;

        if (ch < 127) {
            out[pos] = ch;
            continue;
        }
        if (Py_UNICODE_ISSPACE(ch)) {
            out[pos] = ' ';
            continue;
        }
        digit = Py_UNICODE_TODECIMAL(ch);
        if (digit >= 0) {
            out[pos] = '0' + digit;
            continue;
        }

        /* neither space nor digit: mark it and truncate the result here */
        out[pos] = '?';
        out[pos+1] = '\0';
        _PyUnicode_LENGTH(result) = pos + 1;
        break;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
9156
9157
PyObject *
9158
PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9159
                                  Py_ssize_t length)
9160
0
{
9161
0
    PyObject *decimal;
9162
0
    Py_ssize_t i;
9163
0
    Py_UCS4 maxchar;
9164
0
    enum PyUnicode_Kind kind;
9165
0
    void *data;
9166
9167
0
    maxchar = 127;
9168
0
    for (i = 0; i < length; i++) {
9169
0
        Py_UCS4 ch = s[i];
9170
0
        if (ch > 127) {
9171
0
            int decimal = Py_UNICODE_TODECIMAL(ch);
9172
0
            if (decimal >= 0)
9173
0
                ch = '0' + decimal;
9174
0
            maxchar = Py_MAX(maxchar, ch);
9175
0
        }
9176
0
    }
9177
9178
    /* Copy to a new string */
9179
0
    decimal = PyUnicode_New(length, maxchar);
9180
0
    if (decimal == NULL)
9181
0
        return decimal;
9182
0
    kind = PyUnicode_KIND(decimal);
9183
0
    data = PyUnicode_DATA(decimal);
9184
    /* Iterate over code points */
9185
0
    for (i = 0; i < length; i++) {
9186
0
        Py_UCS4 ch = s[i];
9187
0
        if (ch > 127) {
9188
0
            int decimal = Py_UNICODE_TODECIMAL(ch);
9189
0
            if (decimal >= 0)
9190
0
                ch = '0' + decimal;
9191
0
        }
9192
0
        PyUnicode_WRITE(kind, data, i, ch);
9193
0
    }
9194
0
    return unicode_result(decimal);
9195
0
}
9196
/* --- Decimal Encoder ---------------------------------------------------- */
9197
9198
int
9199
PyUnicode_EncodeDecimal(Py_UNICODE *s,
9200
                        Py_ssize_t length,
9201
                        char *output,
9202
                        const char *errors)
9203
0
{
9204
0
    PyObject *unicode;
9205
0
    Py_ssize_t i;
9206
0
    enum PyUnicode_Kind kind;
9207
0
    void *data;
9208
9209
0
    if (output == NULL) {
9210
0
        PyErr_BadArgument();
9211
0
        return -1;
9212
0
    }
9213
9214
0
    unicode = PyUnicode_FromWideChar(s, length);
9215
0
    if (unicode == NULL)
9216
0
        return -1;
9217
9218
0
    kind = PyUnicode_KIND(unicode);
9219
0
    data = PyUnicode_DATA(unicode);
9220
9221
0
    for (i=0; i < length; ) {
9222
0
        PyObject *exc;
9223
0
        Py_UCS4 ch;
9224
0
        int decimal;
9225
0
        Py_ssize_t startpos;
9226
9227
0
        ch = PyUnicode_READ(kind, data, i);
9228
9229
0
        if (Py_UNICODE_ISSPACE(ch)) {
9230
0
            *output++ = ' ';
9231
0
            i++;
9232
0
            continue;
9233
0
        }
9234
0
        decimal = Py_UNICODE_TODECIMAL(ch);
9235
0
        if (decimal >= 0) {
9236
0
            *output++ = '0' + decimal;
9237
0
            i++;
9238
0
            continue;
9239
0
        }
9240
0
        if (0 < ch && ch < 256) {
9241
0
            *output++ = (char)ch;
9242
0
            i++;
9243
0
            continue;
9244
0
        }
9245
9246
0
        startpos = i;
9247
0
        exc = NULL;
9248
0
        raise_encode_exception(&exc, "decimal", unicode,
9249
0
                               startpos, startpos+1,
9250
0
                               "invalid decimal Unicode string");
9251
0
        Py_XDECREF(exc);
9252
0
        Py_DECREF(unicode);
9253
0
        return -1;
9254
0
    }
9255
    /* 0-terminate the output string */
9256
0
    *output++ = '\0';
9257
0
    Py_DECREF(unicode);
9258
0
    return 0;
9259
0
}
9260
9261
/* --- Helpers ------------------------------------------------------------ */
9262
9263
/* Helper macro to fix up start/end slice values against a length:
   `end` is clipped to `len`, negative `start`/`end` are taken relative to
   `len`, and values that are still negative afterwards become 0.

   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.  The body is wrapped in do { } while (0) so
   that the macro behaves as a single statement, e.g. as the body of an
   unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9277
9278
static Py_ssize_t
9279
any_find_slice(PyObject* s1, PyObject* s2,
9280
               Py_ssize_t start,
9281
               Py_ssize_t end,
9282
               int direction)
9283
98
{
9284
98
    int kind1, kind2;
9285
98
    void *buf1, *buf2;
9286
98
    Py_ssize_t len1, len2, result;
9287
9288
98
    kind1 = PyUnicode_KIND(s1);
9289
98
    kind2 = PyUnicode_KIND(s2);
9290
98
    if (kind1 < kind2)
9291
0
        return -1;
9292
9293
98
    len1 = PyUnicode_GET_LENGTH(s1);
9294
98
    len2 = PyUnicode_GET_LENGTH(s2);
9295
98
    ADJUST_INDICES(start, end, len1);
9296
98
    if (end - start < len2)
9297
0
        return -1;
9298
9299
98
    buf1 = PyUnicode_DATA(s1);
9300
98
    buf2 = PyUnicode_DATA(s2);
9301
98
    if (len2 == 1) {
9302
84
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9303
84
        result = findchar((const char *)buf1 + kind1*start,
9304
84
                          kind1, end - start, ch, direction);
9305
84
        if (result == -1)
9306
14
            return -1;
9307
70
        else
9308
70
            return start + result;
9309
84
    }
9310
9311
14
    if (kind2 != kind1) {
9312
0
        buf2 = _PyUnicode_AsKind(s2, kind1);
9313
0
        if (!buf2)
9314
0
            return -2;
9315
0
    }
9316
9317
14
    if (direction > 0) {
9318
14
        switch (kind1) {
9319
14
        case PyUnicode_1BYTE_KIND:
9320
14
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9321
14
                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9322
0
            else
9323
0
                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9324
14
            break;
9325
0
        case PyUnicode_2BYTE_KIND:
9326
0
            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9327
0
            break;
9328
0
        case PyUnicode_4BYTE_KIND:
9329
0
            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9330
0
            break;
9331
0
        default:
9332
0
            Py_UNREACHABLE();
9333
14
        }
9334
14
    }
9335
0
    else {
9336
0
        switch (kind1) {
9337
0
        case PyUnicode_1BYTE_KIND:
9338
0
            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9339
0
                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340
0
            else
9341
0
                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342
0
            break;
9343
0
        case PyUnicode_2BYTE_KIND:
9344
0
            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9345
0
            break;
9346
0
        case PyUnicode_4BYTE_KIND:
9347
0
            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9348
0
            break;
9349
0
        default:
9350
0
            Py_UNREACHABLE();
9351
0
        }
9352
0
    }
9353
9354
14
    if (kind2 != kind1)
9355
0
        PyMem_Free(buf2);
9356
9357
14
    return result;
9358
14
}
9359
9360
/* _PyUnicode_InsertThousandsGrouping() helper functions */
9361
#include "stringlib/localeutil.h"
9362
9363
/**
9364
 * InsertThousandsGrouping:
9365
 * @writer: Unicode writer.
9366
 * @n_buffer: Number of characters in @buffer.
9367
 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9368
 * @d_pos: Start of digits string.
9369
 * @n_digits: The number of digits in the string, in which we want
9370
 *            to put the grouping chars.
9371
 * @min_width: The minimum width of the digits in the output string.
9372
 *             Output will be zero-padded on the left to fill.
9373
 * @grouping: see definition in localeconv().
9374
 * @thousands_sep: see definition in localeconv().
9375
 *
9376
 * There are 2 modes: counting and filling. If @writer is NULL,
9377
 *  we are in counting mode, else filling mode.
9378
 * If counting, the required buffer size is returned.
9379
 * If filling, we know the buffer will be large enough, so we don't
9380
 *  need to pass in the buffer size.
9381
 * Inserts thousand grouping characters (as defined by grouping and
9382
 *  thousands_sep) into @writer.
9383
 *
9384
 * Return value: -1 on error, number of characters otherwise.
9385
 **/
9386
Py_ssize_t
9387
_PyUnicode_InsertThousandsGrouping(
9388
    _PyUnicodeWriter *writer,
9389
    Py_ssize_t n_buffer,
9390
    PyObject *digits,
9391
    Py_ssize_t d_pos,
9392
    Py_ssize_t n_digits,
9393
    Py_ssize_t min_width,
9394
    const char *grouping,
9395
    PyObject *thousands_sep,
9396
    Py_UCS4 *maxchar)
9397
0
{
9398
0
    min_width = Py_MAX(0, min_width);
9399
0
    if (writer) {
9400
0
        assert(digits != NULL);
9401
0
        assert(maxchar == NULL);
9402
0
    }
9403
0
    else {
9404
0
        assert(digits == NULL);
9405
0
        assert(maxchar != NULL);
9406
0
    }
9407
0
    assert(0 <= d_pos);
9408
0
    assert(0 <= n_digits);
9409
0
    assert(grouping != NULL);
9410
9411
0
    if (digits != NULL) {
9412
0
        if (PyUnicode_READY(digits) == -1) {
9413
0
            return -1;
9414
0
        }
9415
0
    }
9416
0
    if (PyUnicode_READY(thousands_sep) == -1) {
9417
0
        return -1;
9418
0
    }
9419
9420
0
    Py_ssize_t count = 0;
9421
0
    Py_ssize_t n_zeros;
9422
0
    int loop_broken = 0;
9423
0
    int use_separator = 0; /* First time through, don't append the
9424
                              separator. They only go between
9425
                              groups. */
9426
0
    Py_ssize_t buffer_pos;
9427
0
    Py_ssize_t digits_pos;
9428
0
    Py_ssize_t len;
9429
0
    Py_ssize_t n_chars;
9430
0
    Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9431
                                        be looked at */
9432
    /* A generator that returns all of the grouping widths, until it
9433
       returns 0. */
9434
0
    GroupGenerator groupgen;
9435
0
    GroupGenerator_init(&groupgen, grouping);
9436
0
    const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9437
9438
    /* if digits are not grouped, thousands separator
9439
       should be an empty string */
9440
0
    assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9441
9442
0
    digits_pos = d_pos + n_digits;
9443
0
    if (writer) {
9444
0
        buffer_pos = writer->pos + n_buffer;
9445
0
        assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9446
0
        assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
9447
0
    }
9448
0
    else {
9449
0
        buffer_pos = n_buffer;
9450
0
    }
9451
9452
0
    if (!writer) {
9453
0
        *maxchar = 127;
9454
0
    }
9455
9456
0
    while ((len = GroupGenerator_next(&groupgen)) > 0) {
9457
0
        len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9458
0
        n_zeros = Py_MAX(0, len - remaining);
9459
0
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9460
9461
        /* Use n_zero zero's and n_chars chars */
9462
9463
        /* Count only, don't do anything. */
9464
0
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9465
9466
        /* Copy into the writer. */
9467
0
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9468
0
                                     digits, &digits_pos,
9469
0
                                     n_chars, n_zeros,
9470
0
                                     use_separator ? thousands_sep : NULL,
9471
0
                                     thousands_sep_len, maxchar);
9472
9473
        /* Use a separator next time. */
9474
0
        use_separator = 1;
9475
9476
0
        remaining -= n_chars;
9477
0
        min_width -= len;
9478
9479
0
        if (remaining <= 0 && min_width <= 0) {
9480
0
            loop_broken = 1;
9481
0
            break;
9482
0
        }
9483
0
        min_width -= thousands_sep_len;
9484
0
    }
9485
0
    if (!loop_broken) {
9486
        /* We left the loop without using a break statement. */
9487
9488
0
        len = Py_MAX(Py_MAX(remaining, min_width), 1);
9489
0
        n_zeros = Py_MAX(0, len - remaining);
9490
0
        n_chars = Py_MAX(0, Py_MIN(remaining, len));
9491
9492
        /* Use n_zero zero's and n_chars chars */
9493
0
        count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9494
9495
        /* Copy into the writer. */
9496
0
        InsertThousandsGrouping_fill(writer, &buffer_pos,
9497
0
                                     digits, &digits_pos,
9498
0
                                     n_chars, n_zeros,
9499
0
                                     use_separator ? thousands_sep : NULL,
9500
0
                                     thousands_sep_len, maxchar);
9501
0
    }
9502
0
    return count;
9503
0
}
9504
9505
9506
Py_ssize_t
9507
PyUnicode_Count(PyObject *str,
9508
                PyObject *substr,
9509
                Py_ssize_t start,
9510
                Py_ssize_t end)
9511
0
{
9512
0
    Py_ssize_t result;
9513
0
    int kind1, kind2;
9514
0
    void *buf1 = NULL, *buf2 = NULL;
9515
0
    Py_ssize_t len1, len2;
9516
9517
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9518
0
        return -1;
9519
9520
0
    kind1 = PyUnicode_KIND(str);
9521
0
    kind2 = PyUnicode_KIND(substr);
9522
0
    if (kind1 < kind2)
9523
0
        return 0;
9524
9525
0
    len1 = PyUnicode_GET_LENGTH(str);
9526
0
    len2 = PyUnicode_GET_LENGTH(substr);
9527
0
    ADJUST_INDICES(start, end, len1);
9528
0
    if (end - start < len2)
9529
0
        return 0;
9530
9531
0
    buf1 = PyUnicode_DATA(str);
9532
0
    buf2 = PyUnicode_DATA(substr);
9533
0
    if (kind2 != kind1) {
9534
0
        buf2 = _PyUnicode_AsKind(substr, kind1);
9535
0
        if (!buf2)
9536
0
            goto onError;
9537
0
    }
9538
9539
0
    switch (kind1) {
9540
0
    case PyUnicode_1BYTE_KIND:
9541
0
        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
9542
0
            result = asciilib_count(
9543
0
                ((Py_UCS1*)buf1) + start, end - start,
9544
0
                buf2, len2, PY_SSIZE_T_MAX
9545
0
                );
9546
0
        else
9547
0
            result = ucs1lib_count(
9548
0
                ((Py_UCS1*)buf1) + start, end - start,
9549
0
                buf2, len2, PY_SSIZE_T_MAX
9550
0
                );
9551
0
        break;
9552
0
    case PyUnicode_2BYTE_KIND:
9553
0
        result = ucs2lib_count(
9554
0
            ((Py_UCS2*)buf1) + start, end - start,
9555
0
            buf2, len2, PY_SSIZE_T_MAX
9556
0
            );
9557
0
        break;
9558
0
    case PyUnicode_4BYTE_KIND:
9559
0
        result = ucs4lib_count(
9560
0
            ((Py_UCS4*)buf1) + start, end - start,
9561
0
            buf2, len2, PY_SSIZE_T_MAX
9562
0
            );
9563
0
        break;
9564
0
    default:
9565
0
        Py_UNREACHABLE();
9566
0
    }
9567
9568
0
    if (kind2 != kind1)
9569
0
        PyMem_Free(buf2);
9570
9571
0
    return result;
9572
0
  onError:
9573
0
    if (kind2 != kind1 && buf2)
9574
0
        PyMem_Free(buf2);
9575
0
    return -1;
9576
0
}
9577
9578
Py_ssize_t
9579
PyUnicode_Find(PyObject *str,
9580
               PyObject *substr,
9581
               Py_ssize_t start,
9582
               Py_ssize_t end,
9583
               int direction)
9584
0
{
9585
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9586
0
        return -2;
9587
9588
0
    return any_find_slice(str, substr, start, end, direction);
9589
0
}
9590
9591
Py_ssize_t
9592
PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9593
                   Py_ssize_t start, Py_ssize_t end,
9594
                   int direction)
9595
2.14k
{
9596
2.14k
    int kind;
9597
2.14k
    Py_ssize_t len, result;
9598
2.14k
    if (PyUnicode_READY(str) == -1)
9599
0
        return -2;
9600
2.14k
    len = PyUnicode_GET_LENGTH(str);
9601
2.14k
    ADJUST_INDICES(start, end, len);
9602
2.14k
    if (end - start < 1)
9603
0
        return -1;
9604
2.14k
    kind = PyUnicode_KIND(str);
9605
2.14k
    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9606
2.14k
                      kind, end-start, ch, direction);
9607
2.14k
    if (result == -1)
9608
2.06k
        return -1;
9609
85
    else
9610
85
        return start + result;
9611
2.14k
}
9612
9613
static int
9614
tailmatch(PyObject *self,
9615
          PyObject *substring,
9616
          Py_ssize_t start,
9617
          Py_ssize_t end,
9618
          int direction)
9619
1.01k
{
9620
1.01k
    int kind_self;
9621
1.01k
    int kind_sub;
9622
1.01k
    void *data_self;
9623
1.01k
    void *data_sub;
9624
1.01k
    Py_ssize_t offset;
9625
1.01k
    Py_ssize_t i;
9626
1.01k
    Py_ssize_t end_sub;
9627
9628
1.01k
    if (PyUnicode_READY(self) == -1 ||
9629
1.01k
        PyUnicode_READY(substring) == -1)
9630
0
        return -1;
9631
9632
1.01k
    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9633
1.01k
    end -= PyUnicode_GET_LENGTH(substring);
9634
1.01k
    if (end < start)
9635
86
        return 0;
9636
9637
930
    if (PyUnicode_GET_LENGTH(substring) == 0)
9638
0
        return 1;
9639
9640
930
    kind_self = PyUnicode_KIND(self);
9641
930
    data_self = PyUnicode_DATA(self);
9642
930
    kind_sub = PyUnicode_KIND(substring);
9643
930
    data_sub = PyUnicode_DATA(substring);
9644
930
    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9645
9646
930
    if (direction > 0)
9647
404
        offset = end;
9648
526
    else
9649
526
        offset = start;
9650
9651
930
    if (PyUnicode_READ(kind_self, data_self, offset) ==
9652
930
        PyUnicode_READ(kind_sub, data_sub, 0) &&
9653
930
        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9654
446
        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9655
        /* If both are of the same kind, memcmp is sufficient */
9656
376
        if (kind_self == kind_sub) {
9657
376
            return ! memcmp((char *)data_self +
9658
376
                                (offset * PyUnicode_KIND(substring)),
9659
376
                            data_sub,
9660
376
                            PyUnicode_GET_LENGTH(substring) *
9661
376
                                PyUnicode_KIND(substring));
9662
376
        }
9663
        /* otherwise we have to compare each character by first accessing it */
9664
0
        else {
9665
            /* We do not need to compare 0 and len(substring)-1 because
9666
               the if statement above ensured already that they are equal
9667
               when we end up here. */
9668
0
            for (i = 1; i < end_sub; ++i) {
9669
0
                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9670
0
                    PyUnicode_READ(kind_sub, data_sub, i))
9671
0
                    return 0;
9672
0
            }
9673
0
            return 1;
9674
0
        }
9675
376
    }
9676
9677
554
    return 0;
9678
930
}
9679
9680
Py_ssize_t
9681
PyUnicode_Tailmatch(PyObject *str,
9682
                    PyObject *substr,
9683
                    Py_ssize_t start,
9684
                    Py_ssize_t end,
9685
                    int direction)
9686
0
{
9687
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
9688
0
        return -1;
9689
9690
0
    return tailmatch(str, substr, start, end, direction);
9691
0
}
9692
9693
/* Build a new string of the same length as `self` (created with maximum
 * character 127) holding the lower-cased (`lower` non-zero) or
 * upper-cased (`lower` zero) bytes of `self`.
 *
 * Returns a new reference, or NULL if the allocation fails.
 */
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{
    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(self);
    char *src = PyUnicode_DATA(self);
    char *dst;
    PyObject *converted = PyUnicode_New(nchars, 127);

    if (converted == NULL) {
        return NULL;
    }

    dst = PyUnicode_DATA(converted);
    if (lower) {
        _Py_bytes_lower(dst, src, nchars);
    }
    else {
        _Py_bytes_upper(dst, src, nchars);
    }
    return converted;
}
9710
9711
static Py_UCS4
9712
handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9713
0
{
9714
0
    Py_ssize_t j;
9715
0
    int final_sigma;
9716
0
    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9717
    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9718
9719
     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9720
9721
    where ! is a negation and \p{xxx} is a character with property xxx.
9722
    */
9723
0
    for (j = i - 1; j >= 0; j--) {
9724
0
        c = PyUnicode_READ(kind, data, j);
9725
0
        if (!_PyUnicode_IsCaseIgnorable(c))
9726
0
            break;
9727
0
    }
9728
0
    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9729
0
    if (final_sigma) {
9730
0
        for (j = i + 1; j < length; j++) {
9731
0
            c = PyUnicode_READ(kind, data, j);
9732
0
            if (!_PyUnicode_IsCaseIgnorable(c))
9733
0
                break;
9734
0
        }
9735
0
        final_sigma = j == length || !_PyUnicode_IsCased(c);
9736
0
    }
9737
0
    return (final_sigma) ? 0x3C2 : 0x3C3;
9738
0
}
9739
9740
static int
9741
lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9742
           Py_UCS4 c, Py_UCS4 *mapped)
9743
0
{
9744
    /* Obscure special case. */
9745
0
    if (c == 0x3A3) {
9746
0
        mapped[0] = handle_capital_sigma(kind, data, length, i);
9747
0
        return 1;
9748
0
    }
9749
0
    return _PyUnicode_ToLowerFull(c, mapped);
9750
0
}
9751
9752
static Py_ssize_t
9753
do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9754
0
{
9755
0
    Py_ssize_t i, k = 0;
9756
0
    int n_res, j;
9757
0
    Py_UCS4 c, mapped[3];
9758
9759
0
    c = PyUnicode_READ(kind, data, 0);
9760
0
    n_res = _PyUnicode_ToTitleFull(c, mapped);
9761
0
    for (j = 0; j < n_res; j++) {
9762
0
        *maxchar = Py_MAX(*maxchar, mapped[j]);
9763
0
        res[k++] = mapped[j];
9764
0
    }
9765
0
    for (i = 1; i < length; i++) {
9766
0
        c = PyUnicode_READ(kind, data, i);
9767
0
        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768
0
        for (j = 0; j < n_res; j++) {
9769
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9770
0
            res[k++] = mapped[j];
9771
0
        }
9772
0
    }
9773
0
    return k;
9774
0
}
9775
9776
static Py_ssize_t
9777
0
do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9778
0
    Py_ssize_t i, k = 0;
9779
9780
0
    for (i = 0; i < length; i++) {
9781
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9782
0
        int n_res, j;
9783
0
        if (Py_UNICODE_ISUPPER(c)) {
9784
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9785
0
        }
9786
0
        else if (Py_UNICODE_ISLOWER(c)) {
9787
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9788
0
        }
9789
0
        else {
9790
0
            n_res = 1;
9791
0
            mapped[0] = c;
9792
0
        }
9793
0
        for (j = 0; j < n_res; j++) {
9794
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9795
0
            res[k++] = mapped[j];
9796
0
        }
9797
0
    }
9798
0
    return k;
9799
0
}
9800
9801
static Py_ssize_t
9802
do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9803
                  Py_UCS4 *maxchar, int lower)
9804
0
{
9805
0
    Py_ssize_t i, k = 0;
9806
9807
0
    for (i = 0; i < length; i++) {
9808
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9809
0
        int n_res, j;
9810
0
        if (lower)
9811
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9812
0
        else
9813
0
            n_res = _PyUnicode_ToUpperFull(c, mapped);
9814
0
        for (j = 0; j < n_res; j++) {
9815
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9816
0
            res[k++] = mapped[j];
9817
0
        }
9818
0
    }
9819
0
    return k;
9820
0
}
9821
9822
static Py_ssize_t
9823
do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9824
0
{
9825
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9826
0
}
9827
9828
static Py_ssize_t
9829
do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9830
0
{
9831
0
    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9832
0
}
9833
9834
static Py_ssize_t
9835
do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9836
0
{
9837
0
    Py_ssize_t i, k = 0;
9838
9839
0
    for (i = 0; i < length; i++) {
9840
0
        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9841
0
        Py_UCS4 mapped[3];
9842
0
        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9843
0
        for (j = 0; j < n_res; j++) {
9844
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9845
0
            res[k++] = mapped[j];
9846
0
        }
9847
0
    }
9848
0
    return k;
9849
0
}
9850
9851
static Py_ssize_t
9852
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853
0
{
9854
0
    Py_ssize_t i, k = 0;
9855
0
    int previous_is_cased;
9856
9857
0
    previous_is_cased = 0;
9858
0
    for (i = 0; i < length; i++) {
9859
0
        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9860
0
        Py_UCS4 mapped[3];
9861
0
        int n_res, j;
9862
9863
0
        if (previous_is_cased)
9864
0
            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9865
0
        else
9866
0
            n_res = _PyUnicode_ToTitleFull(c, mapped);
9867
9868
0
        for (j = 0; j < n_res; j++) {
9869
0
            *maxchar = Py_MAX(*maxchar, mapped[j]);
9870
0
            res[k++] = mapped[j];
9871
0
        }
9872
9873
0
        previous_is_cased = _PyUnicode_IsCased(c);
9874
0
    }
9875
0
    return k;
9876
0
}
9877
9878
static PyObject *
9879
case_operation(PyObject *self,
9880
               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9881
0
{
9882
0
    PyObject *res = NULL;
9883
0
    Py_ssize_t length, newlength = 0;
9884
0
    int kind, outkind;
9885
0
    void *data, *outdata;
9886
0
    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9887
9888
0
    assert(PyUnicode_IS_READY(self));
9889
9890
0
    kind = PyUnicode_KIND(self);
9891
0
    data = PyUnicode_DATA(self);
9892
0
    length = PyUnicode_GET_LENGTH(self);
9893
0
    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9894
0
        PyErr_SetString(PyExc_OverflowError, "string is too long");
9895
0
        return NULL;
9896
0
    }
9897
0
    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9898
0
    if (tmp == NULL)
9899
0
        return PyErr_NoMemory();
9900
0
    newlength = perform(kind, data, length, tmp, &maxchar);
9901
0
    res = PyUnicode_New(newlength, maxchar);
9902
0
    if (res == NULL)
9903
0
        goto leave;
9904
0
    tmpend = tmp + newlength;
9905
0
    outdata = PyUnicode_DATA(res);
9906
0
    outkind = PyUnicode_KIND(res);
9907
0
    switch (outkind) {
9908
0
    case PyUnicode_1BYTE_KIND:
9909
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9910
0
        break;
9911
0
    case PyUnicode_2BYTE_KIND:
9912
0
        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9913
0
        break;
9914
0
    case PyUnicode_4BYTE_KIND:
9915
0
        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9916
0
        break;
9917
0
    default:
9918
0
        Py_UNREACHABLE();
9919
0
    }
9920
0
  leave:
9921
0
    PyMem_FREE(tmp);
9922
0
    return res;
9923
0
}
9924
9925
/* Join the items of the iterable `seq` with `separator`.
 *
 * `seq` is first turned into a fast sequence; the actual work is done by
 * _PyUnicode_JoinArray().
 *
 * Returns a new reference, or NULL on error.
 */
PyObject *
PyUnicode_Join(PyObject *separator, PyObject *seq)
{
    PyObject *joined;
    PyObject *fast = PySequence_Fast(seq, "can only join an iterable");

    if (fast == NULL) {
        return NULL;
    }

    /* NOTE: the following code can't call back into Python code,
     * so we are sure that the fast sequence won't be mutated.
     */
    joined = _PyUnicode_JoinArray(separator,
                                  PySequence_Fast_ITEMS(fast),
                                  PySequence_Fast_GET_SIZE(fast));
    Py_DECREF(fast);
    return joined;
}
9948
9949
PyObject *
9950
_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
9951
2.39k
{
9952
2.39k
    PyObject *res = NULL; /* the result */
9953
2.39k
    PyObject *sep = NULL;
9954
2.39k
    Py_ssize_t seplen;
9955
2.39k
    PyObject *item;
9956
2.39k
    Py_ssize_t sz, i, res_offset;
9957
2.39k
    Py_UCS4 maxchar;
9958
2.39k
    Py_UCS4 item_maxchar;
9959
2.39k
    int use_memcpy;
9960
2.39k
    unsigned char *res_data = NULL, *sep_data = NULL;
9961
2.39k
    PyObject *last_obj;
9962
2.39k
    unsigned int kind = 0;
9963
9964
    /* If empty sequence, return u"". */
9965
2.39k
    if (seqlen == 0) {
9966
0
        _Py_RETURN_UNICODE_EMPTY();
9967
0
    }
9968
9969
    /* If singleton sequence with an exact Unicode, return that. */
9970
2.39k
    last_obj = NULL;
9971
2.39k
    if (seqlen == 1) {
9972
41
        if (PyUnicode_CheckExact(items[0])) {
9973
41
            res = items[0];
9974
41
            Py_INCREF(res);
9975
41
            return res;
9976
41
        }
9977
0
        seplen = 0;
9978
0
        maxchar = 0;
9979
0
    }
9980
2.34k
    else {
9981
        /* Set up sep and seplen */
9982
2.34k
        if (separator == NULL) {
9983
            /* fall back to a blank space separator */
9984
0
            sep = PyUnicode_FromOrdinal(' ');
9985
0
            if (!sep)
9986
0
                goto onError;
9987
0
            seplen = 1;
9988
0
            maxchar = 32;
9989
0
        }
9990
2.34k
        else {
9991
2.34k
            if (!PyUnicode_Check(separator)) {
9992
0
                PyErr_Format(PyExc_TypeError,
9993
0
                             "separator: expected str instance,"
9994
0
                             " %.80s found",
9995
0
                             Py_TYPE(separator)->tp_name);
9996
0
                goto onError;
9997
0
            }
9998
2.34k
            if (PyUnicode_READY(separator))
9999
0
                goto onError;
10000
2.34k
            sep = separator;
10001
2.34k
            seplen = PyUnicode_GET_LENGTH(separator);
10002
2.34k
            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10003
            /* inc refcount to keep this code path symmetric with the
10004
               above case of a blank separator */
10005
2.34k
            Py_INCREF(sep);
10006
2.34k
        }
10007
2.34k
        last_obj = sep;
10008
2.34k
    }
10009
10010
    /* There are at least two things to join, or else we have a subclass
10011
     * of str in the sequence.
10012
     * Do a pre-pass to figure out the total amount of space we'll
10013
     * need (sz), and see whether all argument are strings.
10014
     */
10015
2.34k
    sz = 0;
10016
#ifdef Py_DEBUG
10017
    use_memcpy = 0;
10018
#else
10019
2.34k
    use_memcpy = 1;
10020
2.34k
#endif
10021
8.74k
    for (i = 0; i < seqlen; i++) {
10022
6.39k
        size_t add_sz;
10023
6.39k
        item = items[i];
10024
6.39k
        if (!PyUnicode_Check(item)) {
10025
0
            PyErr_Format(PyExc_TypeError,
10026
0
                         "sequence item %zd: expected str instance,"
10027
0
                         " %.80s found",
10028
0
                         i, Py_TYPE(item)->tp_name);
10029
0
            goto onError;
10030
0
        }
10031
6.39k
        if (PyUnicode_READY(item) == -1)
10032
0
            goto onError;
10033
6.39k
        add_sz = PyUnicode_GET_LENGTH(item);
10034
6.39k
        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
10035
6.39k
        maxchar = Py_MAX(maxchar, item_maxchar);
10036
6.39k
        if (i != 0) {
10037
4.05k
            add_sz += seplen;
10038
4.05k
        }
10039
6.39k
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
10040
0
            PyErr_SetString(PyExc_OverflowError,
10041
0
                            "join() result is too long for a Python string");
10042
0
            goto onError;
10043
0
        }
10044
6.39k
        sz += add_sz;
10045
6.39k
        if (use_memcpy && last_obj != NULL) {
10046
6.39k
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10047
0
                use_memcpy = 0;
10048
6.39k
        }
10049
6.39k
        last_obj = item;
10050
6.39k
    }
10051
10052
2.34k
    res = PyUnicode_New(sz, maxchar);
10053
2.34k
    if (res == NULL)
10054
0
        goto onError;
10055
10056
    /* Catenate everything. */
10057
#ifdef Py_DEBUG
10058
    use_memcpy = 0;
10059
#else
10060
2.34k
    if (use_memcpy) {
10061
2.34k
        res_data = PyUnicode_1BYTE_DATA(res);
10062
2.34k
        kind = PyUnicode_KIND(res);
10063
2.34k
        if (seplen != 0)
10064
1.68k
            sep_data = PyUnicode_1BYTE_DATA(sep);
10065
2.34k
    }
10066
2.34k
#endif
10067
2.34k
    if (use_memcpy) {
10068
8.74k
        for (i = 0; i < seqlen; ++i) {
10069
6.39k
            Py_ssize_t itemlen;
10070
6.39k
            item = items[i];
10071
10072
            /* Copy item, and maybe the separator. */
10073
6.39k
            if (i && seplen != 0) {
10074
2.59k
                memcpy(res_data,
10075
2.59k
                          sep_data,
10076
2.59k
                          kind * seplen);
10077
2.59k
                res_data += kind * seplen;
10078
2.59k
            }
10079
10080
6.39k
            itemlen = PyUnicode_GET_LENGTH(item);
10081
6.39k
            if (itemlen != 0) {
10082
6.39k
                memcpy(res_data,
10083
6.39k
                          PyUnicode_DATA(item),
10084
6.39k
                          kind * itemlen);
10085
6.39k
                res_data += kind * itemlen;
10086
6.39k
            }
10087
6.39k
        }
10088
2.34k
        assert(res_data == PyUnicode_1BYTE_DATA(res)
10089
2.34k
                           + kind * PyUnicode_GET_LENGTH(res));
10090
2.34k
    }
10091
0
    else {
10092
0
        for (i = 0, res_offset = 0; i < seqlen; ++i) {
10093
0
            Py_ssize_t itemlen;
10094
0
            item = items[i];
10095
10096
            /* Copy item, and maybe the separator. */
10097
0
            if (i && seplen != 0) {
10098
0
                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10099
0
                res_offset += seplen;
10100
0
            }
10101
10102
0
            itemlen = PyUnicode_GET_LENGTH(item);
10103
0
            if (itemlen != 0) {
10104
0
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
10105
0
                res_offset += itemlen;
10106
0
            }
10107
0
        }
10108
0
        assert(res_offset == PyUnicode_GET_LENGTH(res));
10109
0
    }
10110
10111
2.34k
    Py_XDECREF(sep);
10112
2.34k
    assert(_PyUnicode_CheckConsistency(res, 1));
10113
2.34k
    return res;
10114
10115
0
  onError:
10116
0
    Py_XDECREF(sep);
10117
0
    Py_XDECREF(res);
10118
0
    return NULL;
10119
2.34k
}
10120
10121
void
10122
_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123
                    Py_UCS4 fill_char)
10124
0
{
10125
0
    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
10126
0
    void *data = PyUnicode_DATA(unicode);
10127
0
    assert(PyUnicode_IS_READY(unicode));
10128
0
    assert(unicode_modifiable(unicode));
10129
0
    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10130
0
    assert(start >= 0);
10131
0
    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
10132
0
    unicode_fill(kind, data, fill_char, start, length);
10133
0
}
10134
10135
Py_ssize_t
10136
PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10137
               Py_UCS4 fill_char)
10138
0
{
10139
0
    Py_ssize_t maxlen;
10140
10141
0
    if (!PyUnicode_Check(unicode)) {
10142
0
        PyErr_BadInternalCall();
10143
0
        return -1;
10144
0
    }
10145
0
    if (PyUnicode_READY(unicode) == -1)
10146
0
        return -1;
10147
0
    if (unicode_check_modifiable(unicode))
10148
0
        return -1;
10149
10150
0
    if (start < 0) {
10151
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
10152
0
        return -1;
10153
0
    }
10154
0
    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10155
0
        PyErr_SetString(PyExc_ValueError,
10156
0
                         "fill character is bigger than "
10157
0
                         "the string maximum character");
10158
0
        return -1;
10159
0
    }
10160
10161
0
    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10162
0
    length = Py_MIN(maxlen, length);
10163
0
    if (length <= 0)
10164
0
        return 0;
10165
10166
0
    _PyUnicode_FastFill(unicode, start, length, fill_char);
10167
0
    return length;
10168
0
}
10169
10170
static PyObject *
10171
pad(PyObject *self,
10172
    Py_ssize_t left,
10173
    Py_ssize_t right,
10174
    Py_UCS4 fill)
10175
0
{
10176
0
    PyObject *u;
10177
0
    Py_UCS4 maxchar;
10178
0
    int kind;
10179
0
    void *data;
10180
10181
0
    if (left < 0)
10182
0
        left = 0;
10183
0
    if (right < 0)
10184
0
        right = 0;
10185
10186
0
    if (left == 0 && right == 0)
10187
0
        return unicode_result_unchanged(self);
10188
10189
0
    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10190
0
        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
10191
0
        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10192
0
        return NULL;
10193
0
    }
10194
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10195
0
    maxchar = Py_MAX(maxchar, fill);
10196
0
    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10197
0
    if (!u)
10198
0
        return NULL;
10199
10200
0
    kind = PyUnicode_KIND(u);
10201
0
    data = PyUnicode_DATA(u);
10202
0
    if (left)
10203
0
        unicode_fill(kind, data, fill, 0, left);
10204
0
    if (right)
10205
0
        unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10206
0
    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10207
0
    assert(_PyUnicode_CheckConsistency(u, 1));
10208
0
    return u;
10209
0
}
10210
10211
/* Split `string` at line boundaries by dispatching to the splitlines
   implementation that matches its storage kind (ASCII, UCS1, UCS2 or UCS4).

   `keepends` is forwarded unchanged to that implementation.  Returns its
   result, or NULL if ensure_unicode() rejects `string`. */
PyObject *
PyUnicode_Splitlines(PyObject *string, int keepends)
{
    Py_ssize_t length;

    if (ensure_unicode(string) < 0) {
        return NULL;
    }

    length = PyUnicode_GET_LENGTH(string);

    switch (PyUnicode_KIND(string)) {
    case PyUnicode_1BYTE_KIND:
        /* One-byte storage is shared by ASCII and Latin-1 strings; each has
           its own specialization. */
        if (PyUnicode_IS_ASCII(string)) {
            return asciilib_splitlines(string, PyUnicode_1BYTE_DATA(string),
                                       length, keepends);
        }
        return ucs1lib_splitlines(string, PyUnicode_1BYTE_DATA(string),
                                  length, keepends);

    case PyUnicode_2BYTE_KIND:
        return ucs2lib_splitlines(string, PyUnicode_2BYTE_DATA(string),
                                  length, keepends);

    case PyUnicode_4BYTE_KIND:
        return ucs4lib_splitlines(string, PyUnicode_4BYTE_DATA(string),
                                  length, keepends);

    default:
        Py_UNREACHABLE();
    }
}
10245
10246
static PyObject *
10247
split(PyObject *self,
10248
      PyObject *substring,
10249
      Py_ssize_t maxcount)
10250
74
{
10251
74
    int kind1, kind2;
10252
74
    void *buf1, *buf2;
10253
74
    Py_ssize_t len1, len2;
10254
74
    PyObject* out;
10255
10256
74
    if (maxcount < 0)
10257
74
        maxcount = PY_SSIZE_T_MAX;
10258
10259
74
    if (PyUnicode_READY(self) == -1)
10260
0
        return NULL;
10261
10262
74
    if (substring == NULL)
10263
4
        switch (PyUnicode_KIND(self)) {
10264
4
        case PyUnicode_1BYTE_KIND:
10265
4
            if (PyUnicode_IS_ASCII(self))
10266
4
                return asciilib_split_whitespace(
10267
4
                    self,  PyUnicode_1BYTE_DATA(self),
10268
4
                    PyUnicode_GET_LENGTH(self), maxcount
10269
4
                    );
10270
0
            else
10271
0
                return ucs1lib_split_whitespace(
10272
0
                    self,  PyUnicode_1BYTE_DATA(self),
10273
0
                    PyUnicode_GET_LENGTH(self), maxcount
10274
0
                    );
10275
0
        case PyUnicode_2BYTE_KIND:
10276
0
            return ucs2lib_split_whitespace(
10277
0
                self,  PyUnicode_2BYTE_DATA(self),
10278
0
                PyUnicode_GET_LENGTH(self), maxcount
10279
0
                );
10280
0
        case PyUnicode_4BYTE_KIND:
10281
0
            return ucs4lib_split_whitespace(
10282
0
                self,  PyUnicode_4BYTE_DATA(self),
10283
0
                PyUnicode_GET_LENGTH(self), maxcount
10284
0
                );
10285
0
        default:
10286
0
            Py_UNREACHABLE();
10287
4
        }
10288
10289
70
    if (PyUnicode_READY(substring) == -1)
10290
0
        return NULL;
10291
10292
70
    kind1 = PyUnicode_KIND(self);
10293
70
    kind2 = PyUnicode_KIND(substring);
10294
70
    len1 = PyUnicode_GET_LENGTH(self);
10295
70
    len2 = PyUnicode_GET_LENGTH(substring);
10296
70
    if (kind1 < kind2 || len1 < len2) {
10297
0
        out = PyList_New(1);
10298
0
        if (out == NULL)
10299
0
            return NULL;
10300
0
        Py_INCREF(self);
10301
0
        PyList_SET_ITEM(out, 0, self);
10302
0
        return out;
10303
0
    }
10304
70
    buf1 = PyUnicode_DATA(self);
10305
70
    buf2 = PyUnicode_DATA(substring);
10306
70
    if (kind2 != kind1) {
10307
0
        buf2 = _PyUnicode_AsKind(substring, kind1);
10308
0
        if (!buf2)
10309
0
            return NULL;
10310
0
    }
10311
10312
70
    switch (kind1) {
10313
70
    case PyUnicode_1BYTE_KIND:
10314
70
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10315
70
            out = asciilib_split(
10316
70
                self,  buf1, len1, buf2, len2, maxcount);
10317
0
        else
10318
0
            out = ucs1lib_split(
10319
0
                self,  buf1, len1, buf2, len2, maxcount);
10320
70
        break;
10321
0
    case PyUnicode_2BYTE_KIND:
10322
0
        out = ucs2lib_split(
10323
0
            self,  buf1, len1, buf2, len2, maxcount);
10324
0
        break;
10325
0
    case PyUnicode_4BYTE_KIND:
10326
0
        out = ucs4lib_split(
10327
0
            self,  buf1, len1, buf2, len2, maxcount);
10328
0
        break;
10329
0
    default:
10330
0
        out = NULL;
10331
70
    }
10332
70
    if (kind2 != kind1)
10333
0
        PyMem_Free(buf2);
10334
70
    return out;
10335
70
}
10336
10337
static PyObject *
10338
rsplit(PyObject *self,
10339
       PyObject *substring,
10340
       Py_ssize_t maxcount)
10341
0
{
10342
0
    int kind1, kind2;
10343
0
    void *buf1, *buf2;
10344
0
    Py_ssize_t len1, len2;
10345
0
    PyObject* out;
10346
10347
0
    if (maxcount < 0)
10348
0
        maxcount = PY_SSIZE_T_MAX;
10349
10350
0
    if (PyUnicode_READY(self) == -1)
10351
0
        return NULL;
10352
10353
0
    if (substring == NULL)
10354
0
        switch (PyUnicode_KIND(self)) {
10355
0
        case PyUnicode_1BYTE_KIND:
10356
0
            if (PyUnicode_IS_ASCII(self))
10357
0
                return asciilib_rsplit_whitespace(
10358
0
                    self,  PyUnicode_1BYTE_DATA(self),
10359
0
                    PyUnicode_GET_LENGTH(self), maxcount
10360
0
                    );
10361
0
            else
10362
0
                return ucs1lib_rsplit_whitespace(
10363
0
                    self,  PyUnicode_1BYTE_DATA(self),
10364
0
                    PyUnicode_GET_LENGTH(self), maxcount
10365
0
                    );
10366
0
        case PyUnicode_2BYTE_KIND:
10367
0
            return ucs2lib_rsplit_whitespace(
10368
0
                self,  PyUnicode_2BYTE_DATA(self),
10369
0
                PyUnicode_GET_LENGTH(self), maxcount
10370
0
                );
10371
0
        case PyUnicode_4BYTE_KIND:
10372
0
            return ucs4lib_rsplit_whitespace(
10373
0
                self,  PyUnicode_4BYTE_DATA(self),
10374
0
                PyUnicode_GET_LENGTH(self), maxcount
10375
0
                );
10376
0
        default:
10377
0
            Py_UNREACHABLE();
10378
0
        }
10379
10380
0
    if (PyUnicode_READY(substring) == -1)
10381
0
        return NULL;
10382
10383
0
    kind1 = PyUnicode_KIND(self);
10384
0
    kind2 = PyUnicode_KIND(substring);
10385
0
    len1 = PyUnicode_GET_LENGTH(self);
10386
0
    len2 = PyUnicode_GET_LENGTH(substring);
10387
0
    if (kind1 < kind2 || len1 < len2) {
10388
0
        out = PyList_New(1);
10389
0
        if (out == NULL)
10390
0
            return NULL;
10391
0
        Py_INCREF(self);
10392
0
        PyList_SET_ITEM(out, 0, self);
10393
0
        return out;
10394
0
    }
10395
0
    buf1 = PyUnicode_DATA(self);
10396
0
    buf2 = PyUnicode_DATA(substring);
10397
0
    if (kind2 != kind1) {
10398
0
        buf2 = _PyUnicode_AsKind(substring, kind1);
10399
0
        if (!buf2)
10400
0
            return NULL;
10401
0
    }
10402
10403
0
    switch (kind1) {
10404
0
    case PyUnicode_1BYTE_KIND:
10405
0
        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10406
0
            out = asciilib_rsplit(
10407
0
                self,  buf1, len1, buf2, len2, maxcount);
10408
0
        else
10409
0
            out = ucs1lib_rsplit(
10410
0
                self,  buf1, len1, buf2, len2, maxcount);
10411
0
        break;
10412
0
    case PyUnicode_2BYTE_KIND:
10413
0
        out = ucs2lib_rsplit(
10414
0
            self,  buf1, len1, buf2, len2, maxcount);
10415
0
        break;
10416
0
    case PyUnicode_4BYTE_KIND:
10417
0
        out = ucs4lib_rsplit(
10418
0
            self,  buf1, len1, buf2, len2, maxcount);
10419
0
        break;
10420
0
    default:
10421
0
        out = NULL;
10422
0
    }
10423
0
    if (kind2 != kind1)
10424
0
        PyMem_Free(buf2);
10425
0
    return out;
10426
0
}
10427
10428
static Py_ssize_t
10429
anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10430
            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10431
18
{
10432
18
    switch (kind) {
10433
18
    case PyUnicode_1BYTE_KIND:
10434
18
        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10435
18
            return asciilib_find(buf1, len1, buf2, len2, offset);
10436
0
        else
10437
0
            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10438
0
    case PyUnicode_2BYTE_KIND:
10439
0
        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10440
0
    case PyUnicode_4BYTE_KIND:
10441
0
        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10442
18
    }
10443
18
    Py_UNREACHABLE();
10444
18
}
10445
10446
static Py_ssize_t
10447
anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10448
             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10449
2
{
10450
2
    switch (kind) {
10451
2
    case PyUnicode_1BYTE_KIND:
10452
2
        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10453
2
            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10454
0
        else
10455
0
            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10456
0
    case PyUnicode_2BYTE_KIND:
10457
0
        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10458
0
    case PyUnicode_4BYTE_KIND:
10459
0
        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10460
2
    }
10461
2
    Py_UNREACHABLE();
10462
2
}
10463
10464
static void
10465
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10466
                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10467
0
{
10468
0
    int kind = PyUnicode_KIND(u);
10469
0
    void *data = PyUnicode_DATA(u);
10470
0
    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10471
0
    if (kind == PyUnicode_1BYTE_KIND) {
10472
0
        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10473
0
                                      (Py_UCS1 *)data + len,
10474
0
                                      u1, u2, maxcount);
10475
0
    }
10476
0
    else if (kind == PyUnicode_2BYTE_KIND) {
10477
0
        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10478
0
                                      (Py_UCS2 *)data + len,
10479
0
                                      u1, u2, maxcount);
10480
0
    }
10481
0
    else {
10482
0
        assert(kind == PyUnicode_4BYTE_KIND);
10483
0
        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10484
0
                                      (Py_UCS4 *)data + len,
10485
0
                                      u1, u2, maxcount);
10486
0
    }
10487
0
}
10488
10489
/* Return a string built from self in which occurrences of str1 are
   replaced by str2.

   A negative maxcount is replaced by PY_SSIZE_T_MAX.  When there is nothing
   to replace, the result of unicode_result_unchanged(self) is returned.
   Returns NULL on failure; OverflowError is set explicitly when the result
   would be too long.

   The kind/data/length macros are applied to all three arguments on entry
   without readying them -- presumably callers must pass ready strings
   (TODO confirm against the callers).

   sbuf, buf1 and buf2 start out pointing at the strings' own data.  Each
   may be replaced by a temporary buffer from _PyUnicode_AsKind(); the
   matching srelease/release1/release2 flag records that the buffer must be
   freed, which every exit path (done, nothing, error) does. */
static PyObject *
10490
replace(PyObject *self, PyObject *str1,
10491
        PyObject *str2, Py_ssize_t maxcount)
10492
18
{
10493
18
    PyObject *u;
10494
18
    char *sbuf = PyUnicode_DATA(self);
10495
18
    char *buf1 = PyUnicode_DATA(str1);
10496
18
    char *buf2 = PyUnicode_DATA(str2);
10497
18
    int srelease = 0, release1 = 0, release2 = 0;
10498
18
    int skind = PyUnicode_KIND(self);
10499
18
    int kind1 = PyUnicode_KIND(str1);
10500
18
    int kind2 = PyUnicode_KIND(str2);
10501
18
    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10502
18
    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10503
18
    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10504
18
    int mayshrink;
10505
18
    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10506
10507
18
    if (maxcount < 0)
10508
18
        maxcount = PY_SSIZE_T_MAX;
10509
0
    else if (maxcount == 0 || slen == 0)
10510
0
        goto nothing;
10511
    /* Identity check only: the very same object for pattern and
       replacement cannot change anything. */
10512
18
    if (str1 == str2)
10513
0
        goto nothing;
10514
10515
18
    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10516
18
    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10517
18
    if (maxchar < maxchar_str1)
10518
        /* substring too wide to be present */
10519
0
        goto nothing;
10520
18
    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10521
    /* Replacing str1 with str2 may cause a maxchar reduction in the
10522
       result string. */
10523
18
    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
    /* From here on maxchar is the maximum character used to allocate the
       result. */
10524
18
    maxchar = Py_MAX(maxchar, maxchar_str2);
10525
10526
18
    if (len1 == len2) {
10527
        /* same length */
10528
16
        if (len1 == 0)
10529
0
            goto nothing;
10530
16
        if (len1 == 1) {
10531
            /* replace characters */
10532
16
            Py_UCS4 u1, u2;
10533
16
            Py_ssize_t pos;
10534
10535
16
            u1 = PyUnicode_READ(kind1, buf1, 0);
10536
16
            pos = findchar(sbuf, skind, slen, u1, 1);
10537
16
            if (pos < 0)
10538
16
                goto nothing;
10539
0
            u2 = PyUnicode_READ(kind2, buf2, 0);
10540
0
            u = PyUnicode_New(slen, maxchar);
10541
0
            if (!u)
10542
0
                goto error;
10543
10544
            /* Copy self into the result, then rewrite it in place starting
               at the first match found above. */
0
            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10545
0
            replace_1char_inplace(u, pos, u1, u2, maxcount);
10546
0
        }
10547
0
        else {
            /* rkind is the kind shared by all working buffers; it is the
               byte width used in the memcpy() size computations below. */
10548
0
            int rkind = skind;
10549
0
            char *res;
10550
0
            Py_ssize_t i;
10551
10552
0
            if (kind1 < rkind) {
10553
                /* widen substring */
10554
0
                buf1 = _PyUnicode_AsKind(str1, rkind);
10555
0
                if (!buf1) goto error;
10556
0
                release1 = 1;
10557
0
            }
10558
0
            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10559
0
            if (i < 0)
10560
0
                goto nothing;
10561
0
            if (rkind > kind2) {
10562
                /* widen replacement */
10563
0
                buf2 = _PyUnicode_AsKind(str2, rkind);
10564
0
                if (!buf2) goto error;
10565
0
                release2 = 1;
10566
0
            }
10567
0
            else if (rkind < kind2) {
10568
                /* widen self and buf1 */
10569
0
                rkind = kind2;
                /* Drop any previously widened copy of str1 and clear its
                   flag before the next allocation, so that a failure below
                   cannot free it a second time at the error label. */
10570
0
                if (release1) PyMem_Free(buf1);
10571
0
                release1 = 0;
10572
0
                sbuf = _PyUnicode_AsKind(self, rkind);
10573
0
                if (!sbuf) goto error;
10574
0
                srelease = 1;
10575
0
                buf1 = _PyUnicode_AsKind(str1, rkind);
10576
0
                if (!buf1) goto error;
10577
0
                release1 = 1;
10578
0
            }
10579
0
            u = PyUnicode_New(slen, maxchar);
10580
0
            if (!u)
10581
0
                goto error;
10582
0
            assert(PyUnicode_KIND(u) == rkind);
10583
0
            res = PyUnicode_DATA(u);
10584
10585
0
            memcpy(res, sbuf, rkind * slen);
10586
            /* change everything in-place, starting with this one */
10587
0
            memcpy(res + rkind * i,
10588
0
                   buf2,
10589
0
                   rkind * len2);
10590
0
            i += len1;
10591
10592
            /* The first match was already replaced above, hence the
               pre-decrement of maxcount. */
0
            while ( --maxcount > 0) {
10593
                /* Search the remainder of self starting at i; the value
                   returned is used below as an index from the start of the
                   string. */
0
                i = anylib_find(rkind, self,
10594
0
                                sbuf+rkind*i, slen-i,
10595
0
                                str1, buf1, len1, i);
10596
0
                if (i == -1)
10597
0
                    break;
10598
0
                memcpy(res + rkind * i,
10599
0
                       buf2,
10600
0
                       rkind * len2);
10601
0
                i += len1;
10602
0
            }
10603
0
        }
10604
16
    }
10605
2
    else {
        /* different lengths: the result size must be computed first.
           n is the value returned by anylib_count() (presumably the number
           of matches, capped at maxcount -- confirm against its
           implementation); i indexes self and ires indexes the result. */
10606
2
        Py_ssize_t n, i, j, ires;
10607
2
        Py_ssize_t new_size;
10608
2
        int rkind = skind;
10609
2
        char *res;
10610
10611
2
        if (kind1 < rkind) {
10612
            /* widen substring */
10613
0
            buf1 = _PyUnicode_AsKind(str1, rkind);
10614
0
            if (!buf1) goto error;
10615
0
            release1 = 1;
10616
0
        }
10617
2
        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10618
2
        if (n == 0)
10619
0
            goto nothing;
10620
2
        if (kind2 < rkind) {
10621
            /* widen replacement */
10622
0
            buf2 = _PyUnicode_AsKind(str2, rkind);
10623
0
            if (!buf2) goto error;
10624
0
            release2 = 1;
10625
0
        }
10626
2
        else if (kind2 > rkind) {
10627
            /* widen self and buf1 */
10628
0
            rkind = kind2;
10629
0
            sbuf = _PyUnicode_AsKind(self, rkind);
10630
0
            if (!sbuf) goto error;
10631
0
            srelease = 1;
10632
0
            if (release1) PyMem_Free(buf1);
10633
0
            release1 = 0;
10634
0
            buf1 = _PyUnicode_AsKind(str1, rkind);
10635
0
            if (!buf1) goto error;
10636
0
            release1 = 1;
10637
0
        }
10638
        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10639
           PyUnicode_GET_LENGTH(str1))); */
        /* The test is written with a division so that the overflow check
           itself cannot overflow; n is non-zero at this point. */
10640
2
        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10641
0
                PyErr_SetString(PyExc_OverflowError,
10642
0
                                "replace string is too long");
10643
0
                goto error;
10644
0
        }
10645
2
        new_size = slen + n * (len2 - len1);
10646
2
        if (new_size == 0) {
            /* Empty result: hand out a reference to unicode_empty instead
               of allocating a new string. */
10647
0
            _Py_INCREF_UNICODE_EMPTY();
10648
0
            if (!unicode_empty)
10649
0
                goto error;
10650
0
            u = unicode_empty;
10651
0
            goto done;
10652
0
        }
        /* Guard the size in bytes (new_size * rkind) as well. */
10653
2
        if (new_size > (PY_SSIZE_T_MAX / rkind)) {
10654
0
            PyErr_SetString(PyExc_OverflowError,
10655
0
                            "replace string is too long");
10656
0
            goto error;
10657
0
        }
10658
2
        u = PyUnicode_New(new_size, maxchar);
10659
2
        if (!u)
10660
0
            goto error;
10661
2
        assert(PyUnicode_KIND(u) == rkind);
10662
2
        res = PyUnicode_DATA(u);
10663
2
        ires = i = 0;
10664
2
        if (len1 > 0) {
10665
20
            while (n-- > 0) {
10666
                /* look for next match */
10667
18
                j = anylib_find(rkind, self,
10668
18
                                sbuf + rkind * i, slen-i,
10669
18
                                str1, buf1, len1, i);
10670
18
                if (j == -1)
10671
0
                    break;
10672
18
                else if (j > i) {
10673
                    /* copy unchanged part [i:j] */
10674
18
                    memcpy(res + rkind * ires,
10675
18
                           sbuf + rkind * i,
10676
18
                           rkind * (j-i));
10677
18
                    ires += j - i;
10678
18
                }
10679
                /* copy substitution string */
10680
18
                if (len2 > 0) {
10681
0
                    memcpy(res + rkind * ires,
10682
0
                           buf2,
10683
0
                           rkind * len2);
10684
0
                    ires += len2;
10685
0
                }
10686
18
                i = j + len1;
10687
18
            }
10688
2
            if (i < slen)
10689
                /* copy tail [i:] */
10690
2
                memcpy(res + rkind * ires,
10691
2
                       sbuf + rkind * i,
10692
2
                       rkind * (slen-i));
10693
2
        }
10694
0
        else {
10695
            /* interleave */
            /* str1 is empty here: copy str2 n times, with one character of
               self copied between consecutive copies, then the rest of
               self. */
10696
0
            while (n > 0) {
10697
0
                memcpy(res + rkind * ires,
10698
0
                       buf2,
10699
0
                       rkind * len2);
10700
0
                ires += len2;
10701
0
                if (--n <= 0)
10702
0
                    break;
10703
0
                memcpy(res + rkind * ires,
10704
0
                       sbuf + rkind * i,
10705
0
                       rkind);
10706
0
                ires++;
10707
0
                i++;
10708
0
            }
10709
0
            memcpy(res + rkind * ires,
10710
0
                   sbuf + rkind * i,
10711
0
                   rkind * (slen-i));
10712
0
        }
10713
2
    }
10714
10715
    /* unicode_adjust_maxchar() receives &u and may leave u NULL, which is
       treated as an error. */
2
    if (mayshrink) {
10716
0
        unicode_adjust_maxchar(&u);
10717
0
        if (u == NULL)
10718
0
            goto error;
10719
0
    }
10720
10721
2
  done:
10722
2
    if (srelease)
10723
0
        PyMem_FREE(sbuf);
10724
2
    if (release1)
10725
0
        PyMem_FREE(buf1);
10726
2
    if (release2)
10727
0
        PyMem_FREE(buf2);
10728
2
    assert(_PyUnicode_CheckConsistency(u, 1));
10729
2
    return u;
10730
10731
16
  nothing:
10732
    /* nothing to replace; return original string (when possible) */
10733
16
    if (srelease)
10734
0
        PyMem_FREE(sbuf);
10735
16
    if (release1)
10736
0
        PyMem_FREE(buf1);
10737
16
    if (release2)
10738
0
        PyMem_FREE(buf2);
10739
16
    return unicode_result_unchanged(self);
10740
10741
0
  error:
    /* No result object is live on any path that reaches this label, so
       only the temporary buffers need releasing. */
10742
0
    if (srelease && sbuf)
10743
0
        PyMem_FREE(sbuf);
10744
0
    if (release1 && buf1)
10745
0
        PyMem_FREE(buf1);
10746
0
    if (release2 && buf2)
10747
0
        PyMem_FREE(buf2);
10748
0
    return NULL;
10749
2
}
10750
10751
/* --- Unicode Object Methods --------------------------------------------- */
10752
10753
/*[clinic input]
10754
str.title as unicode_title
10755
10756
Return a version of the string where each word is titlecased.
10757
10758
More specifically, words start with uppercased characters and all remaining
10759
cased characters have lower case.
10760
[clinic start generated code]*/
10761
10762
static PyObject *
unicode_title_impl(PyObject *self)
/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
{
    /* Implementation of str.title(): ready the string, then let
       case_operation() apply do_title.  Returns NULL if the string cannot
       be readied. */
    if (PyUnicode_READY(self) < 0) {
        return NULL;
    }
    return case_operation(self, do_title);
}
10770
10771
/*[clinic input]
10772
str.capitalize as unicode_capitalize
10773
10774
Return a capitalized version of the string.
10775
10776
More specifically, make the first character have upper case and the rest lower
10777
case.
10778
[clinic start generated code]*/
10779
10780
static PyObject *
unicode_capitalize_impl(PyObject *self)
/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
{
    /* Implementation of str.capitalize().  Returns NULL if the string
       cannot be readied. */
    if (PyUnicode_READY(self) < 0) {
        return NULL;
    }

    /* An empty string has nothing to capitalize. */
    if (!PyUnicode_GET_LENGTH(self)) {
        return unicode_result_unchanged(self);
    }

    return case_operation(self, do_capitalize);
}
10790
10791
/*[clinic input]
10792
str.casefold as unicode_casefold
10793
10794
Return a version of the string suitable for caseless comparisons.
10795
[clinic start generated code]*/
10796
10797
static PyObject *
unicode_casefold_impl(PyObject *self)
/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
{
    /* Implementation of str.casefold().  Returns NULL if the string cannot
       be readied. */
    if (PyUnicode_READY(self) < 0) {
        return NULL;
    }

    /* ASCII strings take the ascii_upper_or_lower() shortcut; the second
       argument (1) presumably selects lowercasing -- confirm against that
       helper.  Everything else goes through the generic do_casefold path. */
    return PyUnicode_IS_ASCII(self)
        ? ascii_upper_or_lower(self, 1)
        : case_operation(self, do_casefold);
}
10807
10808
10809
/* Argument converter. Accepts a single Unicode character. */
10810
10811
static int
10812
convert_uc(PyObject *obj, void *addr)
10813
0
{
10814
0
    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10815
10816
0
    if (!PyUnicode_Check(obj)) {
10817
0
        PyErr_Format(PyExc_TypeError,
10818
0
                     "The fill character must be a unicode character, "
10819
0
                     "not %.100s", Py_TYPE(obj)->tp_name);
10820
0
        return 0;
10821
0
    }
10822
0
    if (PyUnicode_READY(obj) < 0)
10823
0
        return 0;
10824
0
    if (PyUnicode_GET_LENGTH(obj) != 1) {
10825
0
        PyErr_SetString(PyExc_TypeError,
10826
0
                        "The fill character must be exactly one character long");
10827
0
        return 0;
10828
0
    }
10829
0
    *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
10830
0
    return 1;
10831
0
}
10832
10833
/*[clinic input]
10834
str.center as unicode_center
10835
10836
    width: Py_ssize_t
10837
    fillchar: Py_UCS4 = ' '
10838
    /
10839
10840
Return a centered string of length width.
10841
10842
Padding is done using the specified fill character (default is a space).
10843
[clinic start generated code]*/
10844
10845
static PyObject *
10846
unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10847
/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
10848
0
{
10849
0
    Py_ssize_t marg, left;
10850
10851
0
    if (PyUnicode_READY(self) == -1)
10852
0
        return NULL;
10853
10854
0
    if (PyUnicode_GET_LENGTH(self) >= width)
10855
0
        return unicode_result_unchanged(self);
10856
10857
0
    marg = width - PyUnicode_GET_LENGTH(self);
10858
0
    left = marg / 2 + (marg & width & 1);
10859
10860
0
    return pad(self, left, marg - left, fillchar);
10861
0
}
10862
10863
/* This function assumes that str1 and str2 are readied by the caller. */
10864
10865
static int
10866
unicode_compare(PyObject *str1, PyObject *str2)
10867
690
{
10868
690
#define COMPARE(TYPE1, TYPE2) \
10869
690
    do { \
10870
0
        TYPE1* p1 = (TYPE1 *)data1; \
10871
0
        TYPE2* p2 = (TYPE2 *)data2; \
10872
0
        TYPE1* end = p1 + len; \
10873
0
        Py_UCS4 c1, c2; \
10874
0
        for (; p1 != end; p1++, p2++) { \
10875
0
            c1 = *p1; \
10876
0
            c2 = *p2; \
10877
0
            if (c1 != c2) \
10878
0
                return (c1 < c2) ? -1 : 1; \
10879
0
        } \
10880
0
    } \
10881
0
    while (0)
10882
10883
690
    int kind1, kind2;
10884
690
    void *data1, *data2;
10885
690
    Py_ssize_t len1, len2, len;
10886
10887
690
    kind1 = PyUnicode_KIND(str1);
10888
690
    kind2 = PyUnicode_KIND(str2);
10889
690
    data1 = PyUnicode_DATA(str1);
10890
690
    data2 = PyUnicode_DATA(str2);
10891
690
    len1 = PyUnicode_GET_LENGTH(str1);
10892
690
    len2 = PyUnicode_GET_LENGTH(str2);
10893
690
    len = Py_MIN(len1, len2);
10894
10895
690
    switch(kind1) {
10896
690
    case PyUnicode_1BYTE_KIND:
10897
690
    {
10898
690
        switch(kind2) {
10899
690
        case PyUnicode_1BYTE_KIND:
10900
690
        {
10901
690
            int cmp = memcmp(data1, data2, len);
10902
            /* normalize result of memcmp() into the range [-1; 1] */
10903
690
            if (cmp < 0)
10904
254
                return -1;
10905
436
            if (cmp > 0)
10906
182
                return 1;
10907
254
            break;
10908
436
        }
10909
254
        case PyUnicode_2BYTE_KIND:
10910
0
            COMPARE(Py_UCS1, Py_UCS2);
10911
0
            break;
10912
0
        case PyUnicode_4BYTE_KIND:
10913
0
            COMPARE(Py_UCS1, Py_UCS4);
10914
0
            break;
10915
0
        default:
10916
0
            Py_UNREACHABLE();
10917
690
        }
10918
254
        break;
10919
690
    }
10920
254
    case PyUnicode_2BYTE_KIND:
10921
0
    {
10922
0
        switch(kind2) {
10923
0
        case PyUnicode_1BYTE_KIND:
10924
0
            COMPARE(Py_UCS2, Py_UCS1);
10925
0
            break;
10926
0
        case PyUnicode_2BYTE_KIND:
10927
0
        {
10928
0
            COMPARE(Py_UCS2, Py_UCS2);
10929
0
            break;
10930
0
        }
10931
0
        case PyUnicode_4BYTE_KIND:
10932
0
            COMPARE(Py_UCS2, Py_UCS4);
10933
0
            break;
10934
0
        default:
10935
0
            Py_UNREACHABLE();
10936
0
        }
10937
0
        break;
10938
0
    }
10939
0
    case PyUnicode_4BYTE_KIND:
10940
0
    {
10941
0
        switch(kind2) {
10942
0
        case PyUnicode_1BYTE_KIND:
10943
0
            COMPARE(Py_UCS4, Py_UCS1);
10944
0
            break;
10945
0
        case PyUnicode_2BYTE_KIND:
10946
0
            COMPARE(Py_UCS4, Py_UCS2);
10947
0
            break;
10948
0
        case PyUnicode_4BYTE_KIND:
10949
0
        {
10950
0
#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10951
0
            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10952
            /* normalize result of wmemcmp() into the range [-1; 1] */
10953
0
            if (cmp < 0)
10954
0
                return -1;
10955
0
            if (cmp > 0)
10956
0
                return 1;
10957
#else
10958
            COMPARE(Py_UCS4, Py_UCS4);
10959
#endif
10960
0
            break;
10961
0
        }
10962
0
        default:
10963
0
            Py_UNREACHABLE();
10964
0
        }
10965
0
        break;
10966
0
    }
10967
0
    default:
10968
0
        Py_UNREACHABLE();
10969
690
    }
10970
10971
254
    if (len1 == len2)
10972
235
        return 0;
10973
19
    if (len1 < len2)
10974
17
        return -1;
10975
2
    else
10976
2
        return 1;
10977
10978
19
#undef COMPARE
10979
19
}
10980
10981
static int
10982
unicode_compare_eq(PyObject *str1, PyObject *str2)
10983
30.9k
{
10984
30.9k
    int kind;
10985
30.9k
    void *data1, *data2;
10986
30.9k
    Py_ssize_t len;
10987
30.9k
    int cmp;
10988
10989
30.9k
    len = PyUnicode_GET_LENGTH(str1);
10990
30.9k
    if (PyUnicode_GET_LENGTH(str2) != len)
10991
21.2k
        return 0;
10992
9.70k
    kind = PyUnicode_KIND(str1);
10993
9.70k
    if (PyUnicode_KIND(str2) != kind)
10994
0
        return 0;
10995
9.70k
    data1 = PyUnicode_DATA(str1);
10996
9.70k
    data2 = PyUnicode_DATA(str2);
10997
10998
9.70k
    cmp = memcmp(data1, data2, len * kind);
10999
9.70k
    return (cmp == 0);
11000
9.70k
}
11001
11002
11003
int
11004
PyUnicode_Compare(PyObject *left, PyObject *right)
11005
892
{
11006
892
    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11007
892
        if (PyUnicode_READY(left) == -1 ||
11008
892
            PyUnicode_READY(right) == -1)
11009
0
            return -1;
11010
11011
        /* a string is equal to itself */
11012
892
        if (left == right)
11013
202
            return 0;
11014
11015
690
        return unicode_compare(left, right);
11016
892
    }
11017
0
    PyErr_Format(PyExc_TypeError,
11018
0
                 "Can't compare %.100s and %.100s",
11019
0
                 left->ob_type->tp_name,
11020
0
                 right->ob_type->tp_name);
11021
0
    return -1;
11022
892
}
11023
11024
int
11025
PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11026
0
{
11027
0
    Py_ssize_t i;
11028
0
    int kind;
11029
0
    Py_UCS4 chr;
11030
0
    const unsigned char *ustr = (const unsigned char *)str;
11031
11032
0
    assert(_PyUnicode_CHECK(uni));
11033
0
    if (!PyUnicode_IS_READY(uni)) {
11034
0
        const wchar_t *ws = _PyUnicode_WSTR(uni);
11035
        /* Compare Unicode string and source character set string */
11036
0
        for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11037
0
            if (chr != ustr[i])
11038
0
                return (chr < ustr[i]) ? -1 : 1;
11039
0
        }
11040
        /* This check keeps Python strings that end in '\0' from comparing equal
11041
         to C strings identical up to that point. */
11042
0
        if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11043
0
            return 1; /* uni is longer */
11044
0
        if (ustr[i])
11045
0
            return -1; /* str is longer */
11046
0
        return 0;
11047
0
    }
11048
0
    kind = PyUnicode_KIND(uni);
11049
0
    if (kind == PyUnicode_1BYTE_KIND) {
11050
0
        const void *data = PyUnicode_1BYTE_DATA(uni);
11051
0
        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
11052
0
        size_t len, len2 = strlen(str);
11053
0
        int cmp;
11054
11055
0
        len = Py_MIN(len1, len2);
11056
0
        cmp = memcmp(data, str, len);
11057
0
        if (cmp != 0) {
11058
0
            if (cmp < 0)
11059
0
                return -1;
11060
0
            else
11061
0
                return 1;
11062
0
        }
11063
0
        if (len1 > len2)
11064
0
            return 1; /* uni is longer */
11065
0
        if (len1 < len2)
11066
0
            return -1; /* str is longer */
11067
0
        return 0;
11068
0
    }
11069
0
    else {
11070
0
        void *data = PyUnicode_DATA(uni);
11071
        /* Compare Unicode string and source character set string */
11072
0
        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
11073
0
            if (chr != (unsigned char)str[i])
11074
0
                return (chr < (unsigned char)(str[i])) ? -1 : 1;
11075
        /* This check keeps Python strings that end in '\0' from comparing equal
11076
         to C strings identical up to that point. */
11077
0
        if (PyUnicode_GET_LENGTH(uni) != i || chr)
11078
0
            return 1; /* uni is longer */
11079
0
        if (str[i])
11080
0
            return -1; /* str is longer */
11081
0
        return 0;
11082
0
    }
11083
0
}
11084
11085
static int
11086
non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11087
0
{
11088
0
    size_t i, len;
11089
0
    const wchar_t *p;
11090
0
    len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11091
0
    if (strlen(str) != len)
11092
0
        return 0;
11093
0
    p = _PyUnicode_WSTR(unicode);
11094
0
    assert(p);
11095
0
    for (i = 0; i < len; i++) {
11096
0
        unsigned char c = (unsigned char)str[i];
11097
0
        if (c >= 128 || p[i] != (wchar_t)c)
11098
0
            return 0;
11099
0
    }
11100
0
    return 1;
11101
0
}
11102
11103
int
11104
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11105
14.1k
{
11106
14.1k
    size_t len;
11107
14.1k
    assert(_PyUnicode_CHECK(unicode));
11108
14.1k
    assert(str);
11109
#ifndef NDEBUG
11110
    for (const char *p = str; *p; p++) {
11111
        assert((unsigned char)*p < 128);
11112
    }
11113
#endif
11114
14.1k
    if (PyUnicode_READY(unicode) == -1) {
11115
        /* Memory error or bad data */
11116
0
        PyErr_Clear();
11117
0
        return non_ready_unicode_equal_to_ascii_string(unicode, str);
11118
0
    }
11119
14.1k
    if (!PyUnicode_IS_ASCII(unicode))
11120
0
        return 0;
11121
14.1k
    len = (size_t)PyUnicode_GET_LENGTH(unicode);
11122
14.1k
    return strlen(str) == len &&
11123
14.1k
           memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11124
14.1k
}
11125
11126
int
11127
_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11128
1.36k
{
11129
1.36k
    PyObject *right_uni;
11130
1.36k
    Py_hash_t hash;
11131
11132
1.36k
    assert(_PyUnicode_CHECK(left));
11133
1.36k
    assert(right->string);
11134
#ifndef NDEBUG
11135
    for (const char *p = right->string; *p; p++) {
11136
        assert((unsigned char)*p < 128);
11137
    }
11138
#endif
11139
11140
1.36k
    if (PyUnicode_READY(left) == -1) {
11141
        /* memory error or bad data */
11142
0
        PyErr_Clear();
11143
0
        return non_ready_unicode_equal_to_ascii_string(left, right->string);
11144
0
    }
11145
11146
1.36k
    if (!PyUnicode_IS_ASCII(left))
11147
0
        return 0;
11148
11149
1.36k
    right_uni = _PyUnicode_FromId(right);       /* borrowed */
11150
1.36k
    if (right_uni == NULL) {
11151
        /* memory error or bad data */
11152
0
        PyErr_Clear();
11153
0
        return _PyUnicode_EqualToASCIIString(left, right->string);
11154
0
    }
11155
11156
1.36k
    if (left == right_uni)
11157
1.31k
        return 1;
11158
11159
50
    if (PyUnicode_CHECK_INTERNED(left))
11160
50
        return 0;
11161
11162
0
    assert(_PyUnicode_HASH(right_uni) != -1);
11163
0
    hash = _PyUnicode_HASH(left);
11164
0
    if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11165
0
        return 0;
11166
11167
0
    return unicode_compare_eq(left, right_uni);
11168
0
}
11169
11170
/* Rich comparison of two objects for the operator `op` (Py_EQ, Py_NE,
   Py_LT, Py_LE, Py_GT, Py_GE).

   Returns a new reference to Py_True/Py_False, Py_NotImplemented when
   either operand is not a unicode object, or NULL when an operand
   cannot be made ready or when `op` is unknown and both operands are
   the same object. */
PyObject *
PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
{
    int result;

    if (!PyUnicode_Check(left) || !PyUnicode_Check(right)) {
        Py_RETURN_NOTIMPLEMENTED;
    }

    if (PyUnicode_READY(left) == -1 || PyUnicode_READY(right) == -1) {
        return NULL;
    }

    if (left != right) {
        if (op == Py_EQ || op == Py_NE) {
            /* Equality only: no ordering needed. */
            result = unicode_compare_eq(left, right);
            result ^= (op == Py_NE);
            return PyBool_FromLong(result);
        }
        /* Ordering comparison. */
        result = unicode_compare(left, right);
        Py_RETURN_RICHCOMPARE(result, 0, op);
    }

    /* Identical objects: the answer depends only on the operator. */
    switch (op) {
    case Py_EQ:
    case Py_LE:
    case Py_GE:
        /* a string is equal to itself */
        Py_RETURN_TRUE;
    case Py_NE:
    case Py_LT:
    case Py_GT:
        Py_RETURN_FALSE;
    default:
        PyErr_BadArgument();
        return NULL;
    }
}
11208
11209
int
11210
_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211
424
{
11212
424
    return unicode_eq(aa, bb);
11213
424
}
11214
11215
int
11216
PyUnicode_Contains(PyObject *str, PyObject *substr)
11217
438
{
11218
438
    int kind1, kind2;
11219
438
    void *buf1, *buf2;
11220
438
    Py_ssize_t len1, len2;
11221
438
    int result;
11222
11223
438
    if (!PyUnicode_Check(substr)) {
11224
0
        PyErr_Format(PyExc_TypeError,
11225
0
                     "'in <string>' requires string as left operand, not %.100s",
11226
0
                     Py_TYPE(substr)->tp_name);
11227
0
        return -1;
11228
0
    }
11229
438
    if (PyUnicode_READY(substr) == -1)
11230
0
        return -1;
11231
438
    if (ensure_unicode(str) < 0)
11232
0
        return -1;
11233
11234
438
    kind1 = PyUnicode_KIND(str);
11235
438
    kind2 = PyUnicode_KIND(substr);
11236
438
    if (kind1 < kind2)
11237
0
        return 0;
11238
438
    len1 = PyUnicode_GET_LENGTH(str);
11239
438
    len2 = PyUnicode_GET_LENGTH(substr);
11240
438
    if (len1 < len2)
11241
0
        return 0;
11242
438
    buf1 = PyUnicode_DATA(str);
11243
438
    buf2 = PyUnicode_DATA(substr);
11244
438
    if (len2 == 1) {
11245
417
        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246
417
        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11247
417
        return result;
11248
417
    }
11249
21
    if (kind2 != kind1) {
11250
0
        buf2 = _PyUnicode_AsKind(substr, kind1);
11251
0
        if (!buf2)
11252
0
            return -1;
11253
0
    }
11254
11255
21
    switch (kind1) {
11256
21
    case PyUnicode_1BYTE_KIND:
11257
21
        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258
21
        break;
11259
0
    case PyUnicode_2BYTE_KIND:
11260
0
        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261
0
        break;
11262
0
    case PyUnicode_4BYTE_KIND:
11263
0
        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264
0
        break;
11265
0
    default:
11266
0
        Py_UNREACHABLE();
11267
21
    }
11268
11269
21
    if (kind2 != kind1)
11270
0
        PyMem_Free(buf2);
11271
11272
21
    return result;
11273
21
}
11274
11275
/* Concat to string or Unicode object giving a new Unicode object. */
11276
11277
PyObject *
11278
PyUnicode_Concat(PyObject *left, PyObject *right)
11279
0
{
11280
0
    PyObject *result;
11281
0
    Py_UCS4 maxchar, maxchar2;
11282
0
    Py_ssize_t left_len, right_len, new_len;
11283
11284
0
    if (ensure_unicode(left) < 0)
11285
0
        return NULL;
11286
11287
0
    if (!PyUnicode_Check(right)) {
11288
0
        PyErr_Format(PyExc_TypeError,
11289
0
                     "can only concatenate str (not \"%.200s\") to str",
11290
0
                     right->ob_type->tp_name);
11291
0
        return NULL;
11292
0
    }
11293
0
    if (PyUnicode_READY(right) < 0)
11294
0
        return NULL;
11295
11296
    /* Shortcuts */
11297
0
    if (left == unicode_empty)
11298
0
        return PyUnicode_FromObject(right);
11299
0
    if (right == unicode_empty)
11300
0
        return PyUnicode_FromObject(left);
11301
11302
0
    left_len = PyUnicode_GET_LENGTH(left);
11303
0
    right_len = PyUnicode_GET_LENGTH(right);
11304
0
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11305
0
        PyErr_SetString(PyExc_OverflowError,
11306
0
                        "strings are too large to concat");
11307
0
        return NULL;
11308
0
    }
11309
0
    new_len = left_len + right_len;
11310
11311
0
    maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11312
0
    maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11313
0
    maxchar = Py_MAX(maxchar, maxchar2);
11314
11315
    /* Concat the two Unicode strings */
11316
0
    result = PyUnicode_New(new_len, maxchar);
11317
0
    if (result == NULL)
11318
0
        return NULL;
11319
0
    _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11320
0
    _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11321
0
    assert(_PyUnicode_CheckConsistency(result, 1));
11322
0
    return result;
11323
0
}
11324
11325
void
11326
PyUnicode_Append(PyObject **p_left, PyObject *right)
11327
3.57k
{
11328
3.57k
    PyObject *left, *res;
11329
3.57k
    Py_UCS4 maxchar, maxchar2;
11330
3.57k
    Py_ssize_t left_len, right_len, new_len;
11331
11332
3.57k
    if (p_left == NULL) {
11333
0
        if (!PyErr_Occurred())
11334
0
            PyErr_BadInternalCall();
11335
0
        return;
11336
0
    }
11337
3.57k
    left = *p_left;
11338
3.57k
    if (right == NULL || left == NULL
11339
3.57k
        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11340
0
        if (!PyErr_Occurred())
11341
0
            PyErr_BadInternalCall();
11342
0
        goto error;
11343
0
    }
11344
11345
3.57k
    if (PyUnicode_READY(left) == -1)
11346
0
        goto error;
11347
3.57k
    if (PyUnicode_READY(right) == -1)
11348
0
        goto error;
11349
11350
    /* Shortcuts */
11351
3.57k
    if (left == unicode_empty) {
11352
68
        Py_DECREF(left);
11353
68
        Py_INCREF(right);
11354
68
        *p_left = right;
11355
68
        return;
11356
68
    }
11357
3.50k
    if (right == unicode_empty)
11358
0
        return;
11359
11360
3.50k
    left_len = PyUnicode_GET_LENGTH(left);
11361
3.50k
    right_len = PyUnicode_GET_LENGTH(right);
11362
3.50k
    if (left_len > PY_SSIZE_T_MAX - right_len) {
11363
0
        PyErr_SetString(PyExc_OverflowError,
11364
0
                        "strings are too large to concat");
11365
0
        goto error;
11366
0
    }
11367
3.50k
    new_len = left_len + right_len;
11368
11369
3.50k
    if (unicode_modifiable(left)
11370
3.50k
        && PyUnicode_CheckExact(right)
11371
3.50k
        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11372
        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11373
           to change the structure size, but characters are stored just after
11374
           the structure, and so it requires to move all characters which is
11375
           not so different than duplicating the string. */
11376
3.50k
        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11377
104
    {
11378
        /* append inplace */
11379
104
        if (unicode_resize(p_left, new_len) != 0)
11380
0
            goto error;
11381
11382
        /* copy 'right' into the newly allocated area of 'left' */
11383
104
        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11384
104
    }
11385
3.40k
    else {
11386
3.40k
        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11387
3.40k
        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11388
3.40k
        maxchar = Py_MAX(maxchar, maxchar2);
11389
11390
        /* Concat the two Unicode strings */
11391
3.40k
        res = PyUnicode_New(new_len, maxchar);
11392
3.40k
        if (res == NULL)
11393
0
            goto error;
11394
3.40k
        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11395
3.40k
        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11396
3.40k
        Py_DECREF(left);
11397
3.40k
        *p_left = res;
11398
3.40k
    }
11399
3.50k
    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11400
3.50k
    return;
11401
11402
0
error:
11403
0
    Py_CLEAR(*p_left);
11404
0
}
11405
11406
void
11407
PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11408
0
{
11409
0
    PyUnicode_Append(pleft, right);
11410
0
    Py_XDECREF(right);
11411
0
}
11412
11413
/*
11414
Wraps stringlib_parse_args_finds() and additionally ensures that the
11415
first argument is a unicode object.
11416
*/
11417
11418
static inline int
11419
parse_args_finds_unicode(const char * function_name, PyObject *args,
11420
                         PyObject **substring,
11421
                         Py_ssize_t *start, Py_ssize_t *end)
11422
98
{
11423
98
    if(stringlib_parse_args_finds(function_name, args, substring,
11424
98
                                  start, end)) {
11425
98
        if (ensure_unicode(*substring) < 0)
11426
0
            return 0;
11427
98
        return 1;
11428
98
    }
11429
0
    return 0;
11430
98
}
11431
11432
PyDoc_STRVAR(count__doc__,
11433
             "S.count(sub[, start[, end]]) -> int\n\
11434
\n\
11435
Return the number of non-overlapping occurrences of substring sub in\n\
11436
string S[start:end].  Optional arguments start and end are\n\
11437
interpreted as in slice notation.");
11438
11439
static PyObject *
11440
unicode_count(PyObject *self, PyObject *args)
11441
0
{
11442
0
    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11443
0
    Py_ssize_t start = 0;
11444
0
    Py_ssize_t end = PY_SSIZE_T_MAX;
11445
0
    PyObject *result;
11446
0
    int kind1, kind2;
11447
0
    void *buf1, *buf2;
11448
0
    Py_ssize_t len1, len2, iresult;
11449
11450
0
    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
11451
0
        return NULL;
11452
11453
0
    kind1 = PyUnicode_KIND(self);
11454
0
    kind2 = PyUnicode_KIND(substring);
11455
0
    if (kind1 < kind2)
11456
0
        return PyLong_FromLong(0);
11457
11458
0
    len1 = PyUnicode_GET_LENGTH(self);
11459
0
    len2 = PyUnicode_GET_LENGTH(substring);
11460
0
    ADJUST_INDICES(start, end, len1);
11461
0
    if (end - start < len2)
11462
0
        return PyLong_FromLong(0);
11463
11464
0
    buf1 = PyUnicode_DATA(self);
11465
0
    buf2 = PyUnicode_DATA(substring);
11466
0
    if (kind2 != kind1) {
11467
0
        buf2 = _PyUnicode_AsKind(substring, kind1);
11468
0
        if (!buf2)
11469
0
            return NULL;
11470
0
    }
11471
0
    switch (kind1) {
11472
0
    case PyUnicode_1BYTE_KIND:
11473
0
        iresult = ucs1lib_count(
11474
0
            ((Py_UCS1*)buf1) + start, end - start,
11475
0
            buf2, len2, PY_SSIZE_T_MAX
11476
0
            );
11477
0
        break;
11478
0
    case PyUnicode_2BYTE_KIND:
11479
0
        iresult = ucs2lib_count(
11480
0
            ((Py_UCS2*)buf1) + start, end - start,
11481
0
            buf2, len2, PY_SSIZE_T_MAX
11482
0
            );
11483
0
        break;
11484
0
    case PyUnicode_4BYTE_KIND:
11485
0
        iresult = ucs4lib_count(
11486
0
            ((Py_UCS4*)buf1) + start, end - start,
11487
0
            buf2, len2, PY_SSIZE_T_MAX
11488
0
            );
11489
0
        break;
11490
0
    default:
11491
0
        Py_UNREACHABLE();
11492
0
    }
11493
11494
0
    result = PyLong_FromSsize_t(iresult);
11495
11496
0
    if (kind2 != kind1)
11497
0
        PyMem_Free(buf2);
11498
11499
0
    return result;
11500
0
}
11501
11502
/*[clinic input]
11503
str.encode as unicode_encode
11504
11505
    encoding: str(c_default="NULL") = 'utf-8'
11506
        The encoding in which to encode the string.
11507
    errors: str(c_default="NULL") = 'strict'
11508
        The error handling scheme to use for encoding errors.
11509
        The default is 'strict' meaning that encoding errors raise a
11510
        UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
11511
        'xmlcharrefreplace' as well as any other name registered with
11512
        codecs.register_error that can handle UnicodeEncodeErrors.
11513
11514
Encode the string using the codec registered for encoding.
11515
[clinic start generated code]*/
11516
11517
static PyObject *
unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
{
    /* Delegates to PyUnicode_AsEncodedString(); `encoding` and `errors`
       are passed through unchanged (their C default is NULL). */
    PyObject *encoded = PyUnicode_AsEncodedString(self, encoding, errors);
    return encoded;
}
11523
11524
/*[clinic input]
11525
str.expandtabs as unicode_expandtabs
11526
11527
    tabsize: int = 8
11528
11529
Return a copy where all tab characters are expanded using spaces.
11530
11531
If tabsize is not given, a tab size of 8 characters is assumed.
11532
[clinic start generated code]*/
11533
11534
static PyObject *
unicode_expandtabs_impl(PyObject *self, int tabsize)
/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
{
    /* Two passes over `self`: the first computes the expanded length (and
       detects overflow), the second fills a new string of that length.
       A tab advances to the next multiple of `tabsize` columns; with
       tabsize <= 0 tabs are dropped.  '\n' and '\r' reset the column.
       Returns `self` unchanged (via unicode_result_unchanged) when it
       contains no tab, a new string otherwise, or NULL on error. */
    Py_ssize_t src_pos, out_pos, column, src_len, pad;
    Py_UCS4 ch;
    PyObject *expanded;
    void *src_data, *dest_data;
    int kind;
    int saw_tab = 0;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }

    src_len = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    src_data = PyUnicode_DATA(self);

    /* First pass: determine size of output string */
    out_pos = 0;
    column = 0;
    for (src_pos = 0; src_pos < src_len; src_pos++) {
        ch = PyUnicode_READ(kind, src_data, src_pos);
        if (ch != '\t') {
            if (out_pos > PY_SSIZE_T_MAX - 1) {
                goto overflow;
            }
            out_pos++;
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }

        saw_tab = 1;
        if (tabsize <= 0) {
            continue;
        }
        pad = tabsize - (column % tabsize); /* cannot overflow */
        if (out_pos > PY_SSIZE_T_MAX - pad) {
            goto overflow;
        }
        column += pad;
        out_pos += pad;
    }

    if (!saw_tab) {
        return unicode_result_unchanged(self);
    }

    /* Second pass: create output string and fill it */
    expanded = PyUnicode_New(out_pos, PyUnicode_MAX_CHAR_VALUE(self));
    if (!expanded) {
        return NULL;
    }
    dest_data = PyUnicode_DATA(expanded);

    out_pos = 0;
    column = 0;
    for (src_pos = 0; src_pos < src_len; src_pos++) {
        ch = PyUnicode_READ(kind, src_data, src_pos);
        if (ch != '\t') {
            PyUnicode_WRITE(kind, dest_data, out_pos, ch);
            out_pos++;
            column = (ch == '\n' || ch == '\r') ? 0 : column + 1;
            continue;
        }

        if (tabsize <= 0) {
            continue;
        }
        pad = tabsize - (column % tabsize);
        column += pad;
        unicode_fill(kind, dest_data, ' ', out_pos, pad);
        out_pos += pad;
    }
    assert (out_pos == PyUnicode_GET_LENGTH(expanded));
    return unicode_result(expanded);

  overflow:
    PyErr_SetString(PyExc_OverflowError, "new string is too long");
    return NULL;
}
11611
11612
PyDoc_STRVAR(find__doc__,
11613
             "S.find(sub[, start[, end]]) -> int\n\
11614
\n\
11615
Return the lowest index in S where substring sub is found,\n\
11616
such that sub is contained within S[start:end].  Optional\n\
11617
arguments start and end are interpreted as in slice notation.\n\
11618
\n\
11619
Return -1 on failure.");
11620
11621
static PyObject *
11622
unicode_find(PyObject *self, PyObject *args)
11623
14
{
11624
    /* initialize variables to prevent gcc warning */
11625
14
    PyObject *substring = NULL;
11626
14
    Py_ssize_t start = 0;
11627
14
    Py_ssize_t end = 0;
11628
14
    Py_ssize_t result;
11629
11630
14
    if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
11631
0
        return NULL;
11632
11633
14
    if (PyUnicode_READY(self) == -1)
11634
0
        return NULL;
11635
11636
14
    result = any_find_slice(self, substring, start, end, 1);
11637
11638
14
    if (result == -2)
11639
0
        return NULL;
11640
11641
14
    return PyLong_FromSsize_t(result);
11642
14
}
11643
11644
static PyObject *
11645
unicode_getitem(PyObject *self, Py_ssize_t index)
11646
5.15k
{
11647
5.15k
    void *data;
11648
5.15k
    enum PyUnicode_Kind kind;
11649
5.15k
    Py_UCS4 ch;
11650
11651
5.15k
    if (!PyUnicode_Check(self)) {
11652
0
        PyErr_BadArgument();
11653
0
        return NULL;
11654
0
    }
11655
5.15k
    if (PyUnicode_READY(self) == -1) {
11656
0
        return NULL;
11657
0
    }
11658
5.15k
    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11659
8
        PyErr_SetString(PyExc_IndexError, "string index out of range");
11660
8
        return NULL;
11661
8
    }
11662
5.14k
    kind = PyUnicode_KIND(self);
11663
5.14k
    data = PyUnicode_DATA(self);
11664
5.14k
    ch = PyUnicode_READ(kind, data, index);
11665
5.14k
    return unicode_char(ch);
11666
5.15k
}
11667
11668
/* Believe it or not, this produces the same value for ASCII strings
11669
   as bytes_hash(). */
11670
static Py_hash_t
11671
unicode_hash(PyObject *self)
11672
104k
{
11673
104k
    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11674
11675
#ifdef Py_DEBUG
11676
    assert(_Py_HashSecret_Initialized);
11677
#endif
11678
104k
    if (_PyUnicode_HASH(self) != -1)
11679
1.28k
        return _PyUnicode_HASH(self);
11680
102k
    if (PyUnicode_READY(self) == -1)
11681
0
        return -1;
11682
11683
102k
    x = _Py_HashBytes(PyUnicode_DATA(self),
11684
102k
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11685
102k
    _PyUnicode_HASH(self) = x;
11686
102k
    return x;
11687
102k
}
11688
11689
PyDoc_STRVAR(index__doc__,
11690
             "S.index(sub[, start[, end]]) -> int\n\
11691
\n\
11692
Return the lowest index in S where substring sub is found,\n\
11693
such that sub is contained within S[start:end].  Optional\n\
11694
arguments start and end are interpreted as in slice notation.\n\
11695
\n\
11696
Raises ValueError when the substring is not found.");
11697
11698
static PyObject *
11699
unicode_index(PyObject *self, PyObject *args)
11700
0
{
11701
    /* initialize variables to prevent gcc warning */
11702
0
    Py_ssize_t result;
11703
0
    PyObject *substring = NULL;
11704
0
    Py_ssize_t start = 0;
11705
0
    Py_ssize_t end = 0;
11706
11707
0
    if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
11708
0
        return NULL;
11709
11710
0
    if (PyUnicode_READY(self) == -1)
11711
0
        return NULL;
11712
11713
0
    result = any_find_slice(self, substring, start, end, 1);
11714
11715
0
    if (result == -2)
11716
0
        return NULL;
11717
11718
0
    if (result < 0) {
11719
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
11720
0
        return NULL;
11721
0
    }
11722
11723
0
    return PyLong_FromSsize_t(result);
11724
0
}
11725
11726
/*[clinic input]
11727
str.isascii as unicode_isascii
11728
11729
Return True if all characters in the string are ASCII, False otherwise.
11730
11731
ASCII characters have code points in the range U+0000-U+007F.
11732
Empty string is ASCII too.
11733
[clinic start generated code]*/
11734
11735
static PyObject *
unicode_isascii_impl(PyObject *self)
/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
{
    /* The answer is the object's ASCII flag; the string must be ready
       before the flag can be read.  NULL is returned if that fails. */
    return (PyUnicode_READY(self) == -1)
        ? NULL
        : PyBool_FromLong(PyUnicode_IS_ASCII(self));
}
11744
11745
/*[clinic input]
11746
str.islower as unicode_islower
11747
11748
Return True if the string is a lowercase string, False otherwise.
11749
11750
A string is lowercase if all cased characters in the string are lowercase and
11751
there is at least one cased character in the string.
11752
[clinic start generated code]*/
11753
11754
static PyObject *
unicode_islower_impl(PyObject *self)
/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
{
    /* True when no character is uppercase or titlecase and at least one
       is lowercase.  Returns NULL if the string cannot be made ready. */
    Py_ssize_t pos, nchars;
    int kind;
    void *data;
    int seen_lower = 0;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
    }

    for (pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* Any upper- or title-case character disqualifies the string. */
        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_lower && Py_UNICODE_ISLOWER(ch)) {
            seen_lower = 1;
        }
    }
    return PyBool_FromLong(seen_lower);
}
11789
11790
/*[clinic input]
11791
str.isupper as unicode_isupper
11792
11793
Return True if the string is an uppercase string, False otherwise.
11794
11795
A string is uppercase if all cased characters in the string are uppercase and
11796
there is at least one cased character in the string.
11797
[clinic start generated code]*/
11798
11799
static PyObject *
unicode_isupper_impl(PyObject *self)
/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
{
    /* True when no character is lowercase or titlecase and at least one
       is uppercase.  Returns NULL if the string cannot be made ready. */
    Py_ssize_t pos, nchars;
    int kind;
    void *data;
    int seen_upper = 0;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
    }

    for (pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        /* Any lower- or title-case character disqualifies the string. */
        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) {
            Py_RETURN_FALSE;
        }
        if (!seen_upper && Py_UNICODE_ISUPPER(ch)) {
            seen_upper = 1;
        }
    }
    return PyBool_FromLong(seen_upper);
}
11834
11835
/*[clinic input]
11836
str.istitle as unicode_istitle
11837
11838
Return True if the string is a title-cased string, False otherwise.
11839
11840
In a title-cased string, upper- and title-case characters may only
11841
follow uncased characters and lowercase characters only cased ones.
11842
[clinic start generated code]*/
11843
11844
static PyObject *
unicode_istitle_impl(PyObject *self)
/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
{
    /* Scans the string while tracking whether the previous character was
       cased: an upper/title-case character must not follow a cased one,
       and a lowercase character must follow a cased one.  The result is
       true only if at least one cased character was seen.  Returns NULL
       if the string cannot be made ready. */
    Py_ssize_t pos, nchars;
    int kind;
    void *data;
    int any_cased = 0;
    int prev_cased = 0;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong((Py_UNICODE_ISTITLE(only) != 0) ||
                               (Py_UNICODE_ISUPPER(only) != 0));
    }

    for (pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);

        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
            /* Upper/title case may only start a word. */
            if (prev_cased) {
                Py_RETURN_FALSE;
            }
            prev_cased = 1;
            any_cased = 1;
            continue;
        }
        if (Py_UNICODE_ISLOWER(ch)) {
            /* Lowercase may only continue a word. */
            if (!prev_cased) {
                Py_RETURN_FALSE;
            }
            prev_cased = 1;
            any_cased = 1;
            continue;
        }
        /* Uncased character: the next cased one starts a new word. */
        prev_cased = 0;
    }
    return PyBool_FromLong(any_cased);
}
11892
11893
/*[clinic input]
11894
str.isspace as unicode_isspace
11895
11896
Return True if the string is a whitespace string, False otherwise.
11897
11898
A string is whitespace if all characters in the string are whitespace and there
11899
is at least one character in the string.
11900
[clinic start generated code]*/
11901
11902
static PyObject *
unicode_isspace_impl(PyObject *self)
/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISSPACE.  Returns NULL if the string cannot be made
       ready. */
    Py_ssize_t pos, nchars;
    int kind;
    void *data;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
    }

    pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISSPACE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11932
11933
/*[clinic input]
11934
str.isalpha as unicode_isalpha
11935
11936
Return True if the string is an alphabetic string, False otherwise.
11937
11938
A string is alphabetic if all characters in the string are alphabetic and there
11939
is at least one character in the string.
11940
[clinic start generated code]*/
11941
11942
static PyObject *
unicode_isalpha_impl(PyObject *self)
/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
{
    /* True when the string is non-empty and every character satisfies
       Py_UNICODE_ISALPHA.  Returns NULL if the string cannot be made
       ready. */
    Py_ssize_t pos, nchars;
    int kind;
    void *data;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Special case for empty strings */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Shortcut for single character strings */
    if (nchars == 1) {
        return PyBool_FromLong(
            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
    }

    pos = 0;
    while (pos < nchars) {
        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, pos))) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
11971
11972
/*[clinic input]
11973
str.isalnum as unicode_isalnum
11974
11975
Return True if the string is an alpha-numeric string, False otherwise.
11976
11977
A string is alpha-numeric if all characters in the string are alpha-numeric and
11978
there is at least one character in the string.
11979
[clinic start generated code]*/
11980
11981
static PyObject *
unicode_isalnum_impl(PyObject *self)
/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
{
    /* Return True when the string is non-empty and every character passes
       Py_UNICODE_ISALNUM, False otherwise.  Returns NULL when
       PyUnicode_READY() fails. */
    Py_ssize_t nchars, pos;
    int kind;
    void *data;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }

    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);
    nchars = PyUnicode_GET_LENGTH(self);

    /* The empty string has no characters to qualify. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* One-character strings are answered directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISALNUM(only));
    }

    for (pos = 0; pos < nchars; pos++) {
        /* Read once into a local before handing it to the macro. */
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (Py_UNICODE_ISALNUM(ch)) {
            continue;
        }
        Py_RETURN_FALSE;
    }
    Py_RETURN_TRUE;
}
12013
12014
/*[clinic input]
12015
str.isdecimal as unicode_isdecimal
12016
12017
Return True if the string is a decimal string, False otherwise.
12018
12019
A string is a decimal string if all characters in the string are decimal and
12020
there is at least one character in the string.
12021
[clinic start generated code]*/
12022
12023
static PyObject *
unicode_isdecimal_impl(PyObject *self)
/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
{
    /* Return True when the string is non-empty and every character passes
       Py_UNICODE_ISDECIMAL, False otherwise.  Returns NULL when
       PyUnicode_READY() fails. */
    Py_ssize_t nchars, pos;
    int kind;
    void *data;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    if (nchars == 1) {
        /* Single character: answer without looping. */
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDECIMAL(only));
    }
    else if (nchars == 0) {
        /* Empty strings are not decimal strings. */
        Py_RETURN_FALSE;
    }

    for (pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDECIMAL(ch)) {
            Py_RETURN_FALSE;
        }
    }
    Py_RETURN_TRUE;
}
12052
12053
/*[clinic input]
12054
str.isdigit as unicode_isdigit
12055
12056
Return True if the string is a digit string, False otherwise.
12057
12058
A string is a digit string if all characters in the string are digits and there
12059
is at least one character in the string.
12060
[clinic start generated code]*/
12061
12062
static PyObject *
unicode_isdigit_impl(PyObject *self)
/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
{
    /* Return True when the string is non-empty and every character passes
       Py_UNICODE_ISDIGIT, False otherwise.  Returns NULL when
       PyUnicode_READY() fails. */
    Py_ssize_t nchars, pos;
    int kind;
    void *data;
    int all_digits = 1;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Single character: the macro result is converted straight to a bool. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISDIGIT(only));
    }

    /* Empty strings are not digit strings. */
    if (nchars == 0) {
        Py_RETURN_FALSE;
    }

    /* Stop at the first character that is not a digit. */
    for (pos = 0; pos < nchars; pos++) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISDIGIT(ch)) {
            all_digits = 0;
            break;
        }
    }
    return PyBool_FromLong(all_digits);
}
12092
12093
/*[clinic input]
12094
str.isnumeric as unicode_isnumeric
12095
12096
Return True if the string is a numeric string, False otherwise.
12097
12098
A string is numeric if all characters in the string are numeric and there is at
12099
least one character in the string.
12100
[clinic start generated code]*/
12101
12102
static PyObject *
unicode_isnumeric_impl(PyObject *self)
/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
{
    /* Return True when the string is non-empty and every character passes
       Py_UNICODE_ISNUMERIC, False otherwise.  Returns NULL when
       PyUnicode_READY() fails. */
    Py_ssize_t nchars, pos;
    int kind;
    void *data;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    if (nchars == 0) {
        /* Nothing to test: an empty string is not numeric. */
        Py_RETURN_FALSE;
    }
    if (nchars == 1) {
        /* Single character: no loop required. */
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISNUMERIC(only));
    }

    pos = 0;
    do {
        /* nchars >= 2 here, so the first read is always in range. */
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISNUMERIC(ch)) {
            Py_RETURN_FALSE;
        }
    } while (++pos < nchars);
    Py_RETURN_TRUE;
}
12131
12132
int
12133
PyUnicode_IsIdentifier(PyObject *self)
12134
62
{
12135
62
    int kind;
12136
62
    void *data;
12137
62
    Py_ssize_t i;
12138
62
    Py_UCS4 first;
12139
12140
62
    if (PyUnicode_READY(self) == -1) {
12141
0
        Py_FatalError("identifier not ready");
12142
0
        return 0;
12143
0
    }
12144
12145
    /* Special case for empty strings */
12146
62
    if (PyUnicode_GET_LENGTH(self) == 0)
12147
0
        return 0;
12148
62
    kind = PyUnicode_KIND(self);
12149
62
    data = PyUnicode_DATA(self);
12150
12151
    /* PEP 3131 says that the first character must be in
12152
       XID_Start and subsequent characters in XID_Continue,
12153
       and for the ASCII range, the 2.x rules apply (i.e
12154
       start with letters and underscore, continue with
12155
       letters, digits, underscore). However, given the current
12156
       definition of XID_Start and XID_Continue, it is sufficient
12157
       to check just for these, except that _ must be allowed
12158
       as starting an identifier.  */
12159
62
    first = PyUnicode_READ(kind, data, 0);
12160
62
    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
12161
0
        return 0;
12162
12163
407
    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
12164
345
        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
12165
0
            return 0;
12166
62
    return 1;
12167
62
}
12168
12169
/*[clinic input]
12170
str.isidentifier as unicode_isidentifier
12171
12172
Return True if the string is a valid Python identifier, False otherwise.
12173
12174
Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
12175
such as "def" or "class".
12176
[clinic start generated code]*/
12177
12178
static PyObject *
unicode_isidentifier_impl(PyObject *self)
/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
{
    /* Convert the 0/1 answer of PyUnicode_IsIdentifier() to a bool object. */
    const int is_ident = PyUnicode_IsIdentifier(self);

    return PyBool_FromLong(is_ident);
}
12184
12185
/*[clinic input]
12186
str.isprintable as unicode_isprintable
12187
12188
Return True if the string is printable, False otherwise.
12189
12190
A string is printable if all of its characters are considered printable in
12191
repr() or if it is empty.
12192
[clinic start generated code]*/
12193
12194
static PyObject *
unicode_isprintable_impl(PyObject *self)
/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
{
    /* Return True when every character passes Py_UNICODE_ISPRINTABLE.
       There is no empty-string special case: with zero characters the loop
       body never runs and the result is True.  Returns NULL when
       PyUnicode_READY() fails. */
    Py_ssize_t nchars, pos;
    int kind;
    void *data;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }
    nchars = PyUnicode_GET_LENGTH(self);
    kind = PyUnicode_KIND(self);
    data = PyUnicode_DATA(self);

    /* Single character: answer directly. */
    if (nchars == 1) {
        const Py_UCS4 only = PyUnicode_READ(kind, data, 0);
        return PyBool_FromLong(Py_UNICODE_ISPRINTABLE(only));
    }

    pos = 0;
    while (pos < nchars) {
        const Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        if (!Py_UNICODE_ISPRINTABLE(ch)) {
            Py_RETURN_FALSE;
        }
        pos++;
    }
    Py_RETURN_TRUE;
}
12220
12221
/*[clinic input]
12222
str.join as unicode_join
12223
12224
    iterable: object
12225
    /
12226
12227
Concatenate any number of strings.
12228
12229
The string whose method is called is inserted in between each given string.
12230
The result is returned as a new string.
12231
12232
Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12233
[clinic start generated code]*/
12234
12235
static PyObject *
unicode_join(PyObject *self, PyObject *iterable)
/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
{
    /* str.join() is a direct forward to PyUnicode_Join(), with this string
       as the first argument. */
    PyObject *joined = PyUnicode_Join(self, iterable);

    return joined;
}
12241
12242
static Py_ssize_t
12243
unicode_length(PyObject *self)
12244
6.78k
{
12245
6.78k
    if (PyUnicode_READY(self) == -1)
12246
0
        return -1;
12247
6.78k
    return PyUnicode_GET_LENGTH(self);
12248
6.78k
}
12249
12250
/*[clinic input]
12251
str.ljust as unicode_ljust
12252
12253
    width: Py_ssize_t
12254
    fillchar: Py_UCS4 = ' '
12255
    /
12256
12257
Return a left-justified string of length width.
12258
12259
Padding is done using the specified fill character (default is a space).
12260
[clinic start generated code]*/
12261
12262
static PyObject *
12263
unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12264
/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
12265
0
{
12266
0
    if (PyUnicode_READY(self) == -1)
12267
0
        return NULL;
12268
12269
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12270
0
        return unicode_result_unchanged(self);
12271
12272
0
    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
12273
0
}
12274
12275
/*[clinic input]
12276
str.lower as unicode_lower
12277
12278
Return a copy of the string converted to lowercase.
12279
[clinic start generated code]*/
12280
12281
static PyObject *
unicode_lower_impl(PyObject *self)
/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
{
    /* ASCII strings go through ascii_upper_or_lower() with flag 1; all
       other strings go through case_operation() with do_lower. */
    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }

    return PyUnicode_IS_ASCII(self)
        ? ascii_upper_or_lower(self, 1)
        : case_operation(self, do_lower);
}
12291
12292
3.77k
/* Strip modes.  The values double as indices into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  The table is only ever
   read, so both the strings and the pointer array itself are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode `i`; `i` must be one of the three
   strip-mode constants (no bounds check is performed). */
#define STRIPNAME(i) (stripfuncnames[i])
12300
12301
/* externally visible for str.strip(unicode) */
12302
PyObject *
12303
_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12304
3.71k
{
12305
3.71k
    void *data;
12306
3.71k
    int kind;
12307
3.71k
    Py_ssize_t i, j, len;
12308
3.71k
    BLOOM_MASK sepmask;
12309
3.71k
    Py_ssize_t seplen;
12310
12311
3.71k
    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12312
0
        return NULL;
12313
12314
3.71k
    kind = PyUnicode_KIND(self);
12315
3.71k
    data = PyUnicode_DATA(self);
12316
3.71k
    len = PyUnicode_GET_LENGTH(self);
12317
3.71k
    seplen = PyUnicode_GET_LENGTH(sepobj);
12318
3.71k
    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12319
3.71k
                              PyUnicode_DATA(sepobj),
12320
3.71k
                              seplen);
12321
12322
3.71k
    i = 0;
12323
3.71k
    if (striptype != RIGHTSTRIP) {
12324
0
        while (i < len) {
12325
0
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12326
0
            if (!BLOOM(sepmask, ch))
12327
0
                break;
12328
0
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12329
0
                break;
12330
0
            i++;
12331
0
        }
12332
0
    }
12333
12334
3.71k
    j = len;
12335
3.71k
    if (striptype != LEFTSTRIP) {
12336
3.71k
        j--;
12337
3.75k
        while (j >= i) {
12338
3.75k
            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12339
3.75k
            if (!BLOOM(sepmask, ch))
12340
2.90k
                break;
12341
852
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12342
810
                break;
12343
42
            j--;
12344
42
        }
12345
12346
3.71k
        j++;
12347
3.71k
    }
12348
12349
3.71k
    return PyUnicode_Substring(self, i, j);
12350
3.71k
}
12351
12352
PyObject*
12353
PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12354
4.52k
{
12355
4.52k
    unsigned char *data;
12356
4.52k
    int kind;
12357
4.52k
    Py_ssize_t length;
12358
12359
4.52k
    if (PyUnicode_READY(self) == -1)
12360
0
        return NULL;
12361
12362
4.52k
    length = PyUnicode_GET_LENGTH(self);
12363
4.52k
    end = Py_MIN(end, length);
12364
12365
4.52k
    if (start == 0 && end == length)
12366
3.71k
        return unicode_result_unchanged(self);
12367
12368
807
    if (start < 0 || end < 0) {
12369
0
        PyErr_SetString(PyExc_IndexError, "string index out of range");
12370
0
        return NULL;
12371
0
    }
12372
807
    if (start >= length || end < start)
12373
0
        _Py_RETURN_UNICODE_EMPTY();
12374
12375
807
    length = end - start;
12376
807
    if (PyUnicode_IS_ASCII(self)) {
12377
807
        data = PyUnicode_1BYTE_DATA(self);
12378
807
        return _PyUnicode_FromASCII((char*)(data + start), length);
12379
807
    }
12380
0
    else {
12381
0
        kind = PyUnicode_KIND(self);
12382
0
        data = PyUnicode_1BYTE_DATA(self);
12383
0
        return PyUnicode_FromKindAndData(kind,
12384
0
                                         data + kind * start,
12385
0
                                         length);
12386
0
    }
12387
807
}
12388
12389
/* Strip whitespace from `self`.  The leading side is scanned unless
   striptype is RIGHTSTRIP, and the trailing side unless it is LEFTSTRIP.
   ASCII strings are tested against the _Py_ascii_whitespace table; other
   strings use Py_UNICODE_ISSPACE.  The kept range is handed to
   PyUnicode_Substring(). */
static PyObject *
do_strip(PyObject *self, int striptype)
{
    Py_ssize_t len, lo, hi;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }

    len = PyUnicode_GET_LENGTH(self);

    /* [lo, hi) is the range that survives stripping. */
    lo = 0;
    hi = len;

    if (PyUnicode_IS_ASCII(self)) {
        Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(self);

        if (striptype != RIGHTSTRIP) {
            while (lo < len && _Py_ascii_whitespace[bytes[lo]]) {
                lo++;
            }
        }
        if (striptype != LEFTSTRIP) {
            /* hi is exclusive, so the candidate byte sits at hi - 1. */
            while (hi > lo && _Py_ascii_whitespace[bytes[hi - 1]]) {
                hi--;
            }
        }
    }
    else {
        int kind = PyUnicode_KIND(self);
        void *data = PyUnicode_DATA(self);

        if (striptype != RIGHTSTRIP) {
            for (; lo < len; lo++) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, lo);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
        if (striptype != LEFTSTRIP) {
            for (; hi > lo; hi--) {
                const Py_UCS4 ch = PyUnicode_READ(kind, data, hi - 1);
                if (!Py_UNICODE_ISSPACE(ch)) {
                    break;
                }
            }
        }
    }

    return PyUnicode_Substring(self, lo, hi);
}
12453
12454
12455
/* Dispatch for the strip methods: None selects whitespace stripping via
   do_strip(), a str selects _PyUnicode_XStrip(), and any other argument
   raises TypeError naming the method through STRIPNAME(striptype). */
static PyObject *
do_argstrip(PyObject *self, int striptype, PyObject *sep)
{
    if (sep == Py_None) {
        return do_strip(self, striptype);
    }

    if (!PyUnicode_Check(sep)) {
        PyErr_Format(PyExc_TypeError,
                     "%s arg must be None or str",
                     STRIPNAME(striptype));
        return NULL;
    }

    return _PyUnicode_XStrip(self, striptype, sep);
}
12471
12472
12473
/*[clinic input]
12474
str.strip as unicode_strip
12475
12476
    chars: object = None
12477
    /
12478
12479
Return a copy of the string with leading and trailing whitespace removed.
12480
12481
If chars is given and not None, remove characters in chars instead.
12482
[clinic start generated code]*/
12483
12484
static PyObject *
unicode_strip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
{
    /* str.strip(): both ends, delegated to do_argstrip(). */
    PyObject *stripped = do_argstrip(self, BOTHSTRIP, chars);

    return stripped;
}
12490
12491
12492
/*[clinic input]
12493
str.lstrip as unicode_lstrip
12494
12495
    chars: object = None
12496
    /
12497
12498
Return a copy of the string with leading whitespace removed.
12499
12500
If chars is given and not None, remove characters in chars instead.
12501
[clinic start generated code]*/
12502
12503
static PyObject *
unicode_lstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
{
    /* str.lstrip(): leading end only, delegated to do_argstrip(). */
    PyObject *stripped = do_argstrip(self, LEFTSTRIP, chars);

    return stripped;
}
12509
12510
12511
/*[clinic input]
12512
str.rstrip as unicode_rstrip
12513
12514
    chars: object = None
12515
    /
12516
12517
Return a copy of the string with trailing whitespace removed.
12518
12519
If chars is given and not None, remove characters in chars instead.
12520
[clinic start generated code]*/
12521
12522
static PyObject *
unicode_rstrip_impl(PyObject *self, PyObject *chars)
/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
{
    /* str.rstrip(): trailing end only, delegated to do_argstrip(). */
    PyObject *stripped = do_argstrip(self, RIGHTSTRIP, chars);

    return stripped;
}
12528
12529
12530
static PyObject*
12531
unicode_repeat(PyObject *str, Py_ssize_t len)
12532
182
{
12533
182
    PyObject *u;
12534
182
    Py_ssize_t nchars, n;
12535
12536
182
    if (len < 1)
12537
0
        _Py_RETURN_UNICODE_EMPTY();
12538
12539
    /* no repeat, return original string */
12540
182
    if (len == 1)
12541
70
        return unicode_result_unchanged(str);
12542
12543
112
    if (PyUnicode_READY(str) == -1)
12544
0
        return NULL;
12545
12546
112
    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12547
0
        PyErr_SetString(PyExc_OverflowError,
12548
0
                        "repeated string is too long");
12549
0
        return NULL;
12550
0
    }
12551
112
    nchars = len * PyUnicode_GET_LENGTH(str);
12552
12553
112
    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12554
112
    if (!u)
12555
0
        return NULL;
12556
112
    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12557
12558
112
    if (PyUnicode_GET_LENGTH(str) == 1) {
12559
112
        const int kind = PyUnicode_KIND(str);
12560
112
        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12561
112
        if (kind == PyUnicode_1BYTE_KIND) {
12562
112
            void *to = PyUnicode_DATA(u);
12563
112
            memset(to, (unsigned char)fill_char, len);
12564
112
        }
12565
0
        else if (kind == PyUnicode_2BYTE_KIND) {
12566
0
            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12567
0
            for (n = 0; n < len; ++n)
12568
0
                ucs2[n] = fill_char;
12569
0
        } else {
12570
0
            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12571
0
            assert(kind == PyUnicode_4BYTE_KIND);
12572
0
            for (n = 0; n < len; ++n)
12573
0
                ucs4[n] = fill_char;
12574
0
        }
12575
112
    }
12576
0
    else {
12577
        /* number of characters copied this far */
12578
0
        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12579
0
        const Py_ssize_t char_size = PyUnicode_KIND(str);
12580
0
        char *to = (char *) PyUnicode_DATA(u);
12581
0
        memcpy(to, PyUnicode_DATA(str),
12582
0
                  PyUnicode_GET_LENGTH(str) * char_size);
12583
0
        while (done < nchars) {
12584
0
            n = (done <= nchars-done) ? done : nchars-done;
12585
0
            memcpy(to + (done * char_size), to, n * char_size);
12586
0
            done += n;
12587
0
        }
12588
0
    }
12589
12590
112
    assert(_PyUnicode_CheckConsistency(u, 1));
12591
112
    return u;
12592
112
}
12593
12594
PyObject *
12595
PyUnicode_Replace(PyObject *str,
12596
                  PyObject *substr,
12597
                  PyObject *replstr,
12598
                  Py_ssize_t maxcount)
12599
0
{
12600
0
    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12601
0
            ensure_unicode(replstr) < 0)
12602
0
        return NULL;
12603
0
    return replace(str, substr, replstr, maxcount);
12604
0
}
12605
12606
/*[clinic input]
12607
str.replace as unicode_replace
12608
12609
    old: unicode
12610
    new: unicode
12611
    count: Py_ssize_t = -1
12612
        Maximum number of occurrences to replace.
12613
        -1 (the default value) means replace all occurrences.
12614
    /
12615
12616
Return a copy with all occurrences of substring old replaced by new.
12617
12618
If the optional argument count is given, only the first count occurrences are
12619
replaced.
12620
[clinic start generated code]*/
12621
12622
static PyObject *
12623
unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12624
                     Py_ssize_t count)
12625
/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
12626
18
{
12627
18
    if (PyUnicode_READY(self) == -1)
12628
0
        return NULL;
12629
18
    return replace(self, old, new, count);
12630
18
}
12631
12632
/* Build the repr() of a string: the text wrapped in quotes, with quotes,
   backslashes and non-printable characters escaped.

   Pass 1 measures the output and picks the quote character; pass 2 writes
   the characters.  Raises OverflowError if the measured size would exceed
   PY_SSIZE_T_MAX.  Returns NULL on error. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, nsingle, ndouble, i, o;
    Py_UCS4 max, quote;
    int ikind, okind, unchanged;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1) {
        return NULL;
    }

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);
    ikind = PyUnicode_KIND(unicode);

    /* Pass 1: compute the output length, count both quote characters, and
       track the largest character that will be copied unescaped. */
    osize = 0;
    max = 127;
    nsingle = 0;
    ndouble = 0;
    for (i = 0; i < isize; i++) {
        const Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        Py_ssize_t width = 1;

        if (ch == '\'') {
            nsingle++;
        }
        else if (ch == '"') {
            ndouble++;
        }
        else if (ch == '\\' || ch == '\t' || ch == '\r' || ch == '\n') {
            width = 2;  /* backslash plus one character */
        }
        else if (ch < ' ' || ch == 0x7f) {
            width = 4;  /* \xHH */
        }
        else if (ch < 0x7f) {
            /* Printable ASCII: copied as-is, width stays 1. */
        }
        else if (Py_UNICODE_ISPRINTABLE(ch)) {
            if (ch > max) {
                max = ch;
            }
        }
        else if (ch < 0x100) {
            width = 4;  /* \xHH */
        }
        else if (ch < 0x10000) {
            width = 6;  /* \uHHHH */
        }
        else {
            width = 10; /* \UHHHHHHHH */
        }

        if (osize > PY_SSIZE_T_MAX - width) {
            PyErr_SetString(PyExc_OverflowError,
                            "string is too long to generate repr");
            return NULL;
        }
        osize += width;
    }

    /* Prefer single quotes.  Switch to double quotes when the text holds
       single quotes but no double quotes; when it holds both, stay with
       single quotes and reserve one backslash per single quote. */
    quote = '\'';
    unchanged = (osize == isize);
    if (nsingle) {
        unchanged = 0;
        if (ndouble) {
            osize += nsingle;
        }
        else {
            quote = '"';
        }
    }
    /* NOTE(review): unlike the loop above, the addition of nsingle and the
       +2 below are not overflow-checked. */
    osize += 2;   /* quotes */

    repr = PyUnicode_New(osize, max);
    if (repr == NULL) {
        return NULL;
    }
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Both quotes are written now; pass 2 fills what lies between them. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    if (unchanged) {
        /* Nothing needs escaping: copy the whole text in one call. */
        _PyUnicode_FastCopyCharacters(repr, 1,
                                      unicode, 0,
                                      isize);
    }
    else {
        /* Pass 2: o is the next write position in the output. */
        for (i = 0, o = 1; i < isize; i++) {
            const Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
            Py_UCS4 letter;     /* escape introducer: 'x', 'u' or 'U' */
            int shift;          /* bit offset of the first hex digit */

            /* Escape the chosen quote and backslashes */
            if ((ch == quote) || (ch == '\\')) {
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }

            /* Map special whitespace to '\t', '\n', '\r' */
            if (ch == '\t' || ch == '\n' || ch == '\r') {
                Py_UCS4 mnemonic;
                if (ch == '\t') {
                    mnemonic = 't';
                }
                else if (ch == '\n') {
                    mnemonic = 'n';
                }
                else {
                    mnemonic = 'r';
                }
                PyUnicode_WRITE(okind, odata, o++, '\\');
                PyUnicode_WRITE(okind, odata, o++, mnemonic);
                continue;
            }

            if (ch < ' ' || ch == 0x7F) {
                /* Non-printable US ASCII becomes '\xhh' */
                letter = 'x';
                shift = 4;
            }
            else if (ch < 0x7F || Py_UNICODE_ISPRINTABLE(ch)) {
                /* Printable ASCII and printable non-ASCII are copied
                   as-is; the printable test only runs for ch > 0x7F. */
                PyUnicode_WRITE(okind, odata, o++, ch);
                continue;
            }
            else if (ch <= 0xff) {
                /* 8-bit characters become '\xhh' */
                letter = 'x';
                shift = 4;
            }
            else if (ch <= 0xffff) {
                /* 16-bit characters become '\uxxxx' */
                letter = 'u';
                shift = 12;
            }
            else {
                /* Everything above becomes '\Uxxxxxxxx' (8 hex digits) */
                letter = 'U';
                shift = 28;
            }

            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, letter);
            /* Emit the hex digits, most significant nibble first. */
            for (; shift >= 0; shift -= 4) {
                PyUnicode_WRITE(okind, odata, o++,
                                Py_hexdigits[(ch >> shift) & 0xF]);
            }
        }
    }
    /* Closing quote already added at the beginning */
    assert(_PyUnicode_CheckConsistency(repr, 1));
    return repr;
}
12795
12796
PyDoc_STRVAR(rfind__doc__,
12797
             "S.rfind(sub[, start[, end]]) -> int\n\
12798
\n\
12799
Return the highest index in S where substring sub is found,\n\
12800
such that sub is contained within S[start:end].  Optional\n\
12801
arguments start and end are interpreted as in slice notation.\n\
12802
\n\
12803
Return -1 on failure.");
12804
12805
static PyObject *
12806
unicode_rfind(PyObject *self, PyObject *args)
12807
84
{
12808
    /* initialize variables to prevent gcc warning */
12809
84
    PyObject *substring = NULL;
12810
84
    Py_ssize_t start = 0;
12811
84
    Py_ssize_t end = 0;
12812
84
    Py_ssize_t result;
12813
12814
84
    if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
12815
0
        return NULL;
12816
12817
84
    if (PyUnicode_READY(self) == -1)
12818
0
        return NULL;
12819
12820
84
    result = any_find_slice(self, substring, start, end, -1);
12821
12822
84
    if (result == -2)
12823
0
        return NULL;
12824
12825
84
    return PyLong_FromSsize_t(result);
12826
84
}
12827
12828
PyDoc_STRVAR(rindex__doc__,
12829
             "S.rindex(sub[, start[, end]]) -> int\n\
12830
\n\
12831
Return the highest index in S where substring sub is found,\n\
12832
such that sub is contained within S[start:end].  Optional\n\
12833
arguments start and end are interpreted as in slice notation.\n\
12834
\n\
12835
Raises ValueError when the substring is not found.");
12836
12837
static PyObject *
12838
unicode_rindex(PyObject *self, PyObject *args)
12839
0
{
12840
    /* initialize variables to prevent gcc warning */
12841
0
    PyObject *substring = NULL;
12842
0
    Py_ssize_t start = 0;
12843
0
    Py_ssize_t end = 0;
12844
0
    Py_ssize_t result;
12845
12846
0
    if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
12847
0
        return NULL;
12848
12849
0
    if (PyUnicode_READY(self) == -1)
12850
0
        return NULL;
12851
12852
0
    result = any_find_slice(self, substring, start, end, -1);
12853
12854
0
    if (result == -2)
12855
0
        return NULL;
12856
12857
0
    if (result < 0) {
12858
0
        PyErr_SetString(PyExc_ValueError, "substring not found");
12859
0
        return NULL;
12860
0
    }
12861
12862
0
    return PyLong_FromSsize_t(result);
12863
0
}
12864
12865
/*[clinic input]
12866
str.rjust as unicode_rjust
12867
12868
    width: Py_ssize_t
12869
    fillchar: Py_UCS4 = ' '
12870
    /
12871
12872
Return a right-justified string of length width.
12873
12874
Padding is done using the specified fill character (default is a space).
12875
[clinic start generated code]*/
12876
12877
static PyObject *
12878
unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12879
/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
12880
0
{
12881
0
    if (PyUnicode_READY(self) == -1)
12882
0
        return NULL;
12883
12884
0
    if (PyUnicode_GET_LENGTH(self) >= width)
12885
0
        return unicode_result_unchanged(self);
12886
12887
0
    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12888
0
}
12889
12890
PyObject *
12891
PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12892
0
{
12893
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
12894
0
        return NULL;
12895
12896
0
    return split(s, sep, maxsplit);
12897
0
}
12898
12899
/*[clinic input]
12900
str.split as unicode_split
12901
12902
    sep: object = None
12903
        The delimiter according which to split the string.
12904
        None (the default value) means split according to any whitespace,
12905
        and discard empty strings from the result.
12906
    maxsplit: Py_ssize_t = -1
12907
        Maximum number of splits to do.
12908
        -1 (the default value) means no limit.
12909
12910
Return a list of the words in the string, using sep as the delimiter string.
12911
[clinic start generated code]*/
12912
12913
static PyObject *
12914
unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12915
/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
12916
74
{
12917
74
    if (sep == Py_None)
12918
4
        return split(self, NULL, maxsplit);
12919
70
    if (PyUnicode_Check(sep))
12920
70
        return split(self, sep, maxsplit);
12921
12922
0
    PyErr_Format(PyExc_TypeError,
12923
0
                 "must be str or None, not %.100s",
12924
0
                 Py_TYPE(sep)->tp_name);
12925
0
    return NULL;
12926
70
}
12927
12928
PyObject *
12929
PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
12930
0
{
12931
0
    PyObject* out;
12932
0
    int kind1, kind2;
12933
0
    void *buf1, *buf2;
12934
0
    Py_ssize_t len1, len2;
12935
12936
0
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12937
0
        return NULL;
12938
12939
0
    kind1 = PyUnicode_KIND(str_obj);
12940
0
    kind2 = PyUnicode_KIND(sep_obj);
12941
0
    len1 = PyUnicode_GET_LENGTH(str_obj);
12942
0
    len2 = PyUnicode_GET_LENGTH(sep_obj);
12943
0
    if (kind1 < kind2 || len1 < len2) {
12944
0
        _Py_INCREF_UNICODE_EMPTY();
12945
0
        if (!unicode_empty)
12946
0
            out = NULL;
12947
0
        else {
12948
0
            out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12949
0
            Py_DECREF(unicode_empty);
12950
0
        }
12951
0
        return out;
12952
0
    }
12953
0
    buf1 = PyUnicode_DATA(str_obj);
12954
0
    buf2 = PyUnicode_DATA(sep_obj);
12955
0
    if (kind2 != kind1) {
12956
0
        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12957
0
        if (!buf2)
12958
0
            return NULL;
12959
0
    }
12960
12961
0
    switch (kind1) {
12962
0
    case PyUnicode_1BYTE_KIND:
12963
0
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12964
0
            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965
0
        else
12966
0
            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12967
0
        break;
12968
0
    case PyUnicode_2BYTE_KIND:
12969
0
        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12970
0
        break;
12971
0
    case PyUnicode_4BYTE_KIND:
12972
0
        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12973
0
        break;
12974
0
    default:
12975
0
        Py_UNREACHABLE();
12976
0
    }
12977
12978
0
    if (kind2 != kind1)
12979
0
        PyMem_Free(buf2);
12980
12981
0
    return out;
12982
0
}
12983
12984
12985
PyObject *
12986
PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
12987
2.20k
{
12988
2.20k
    PyObject* out;
12989
2.20k
    int kind1, kind2;
12990
2.20k
    void *buf1, *buf2;
12991
2.20k
    Py_ssize_t len1, len2;
12992
12993
2.20k
    if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
12994
0
        return NULL;
12995
12996
2.20k
    kind1 = PyUnicode_KIND(str_obj);
12997
2.20k
    kind2 = PyUnicode_KIND(sep_obj);
12998
2.20k
    len1 = PyUnicode_GET_LENGTH(str_obj);
12999
2.20k
    len2 = PyUnicode_GET_LENGTH(sep_obj);
13000
2.20k
    if (kind1 < kind2 || len1 < len2) {
13001
0
        _Py_INCREF_UNICODE_EMPTY();
13002
0
        if (!unicode_empty)
13003
0
            out = NULL;
13004
0
        else {
13005
0
            out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13006
0
            Py_DECREF(unicode_empty);
13007
0
        }
13008
0
        return out;
13009
0
    }
13010
2.20k
    buf1 = PyUnicode_DATA(str_obj);
13011
2.20k
    buf2 = PyUnicode_DATA(sep_obj);
13012
2.20k
    if (kind2 != kind1) {
13013
0
        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13014
0
        if (!buf2)
13015
0
            return NULL;
13016
0
    }
13017
13018
2.20k
    switch (kind1) {
13019
2.20k
    case PyUnicode_1BYTE_KIND:
13020
2.20k
        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13021
2.20k
            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022
0
        else
13023
0
            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13024
2.20k
        break;
13025
0
    case PyUnicode_2BYTE_KIND:
13026
0
        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027
0
        break;
13028
0
    case PyUnicode_4BYTE_KIND:
13029
0
        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030
0
        break;
13031
0
    default:
13032
0
        Py_UNREACHABLE();
13033
2.20k
    }
13034
13035
2.20k
    if (kind2 != kind1)
13036
0
        PyMem_Free(buf2);
13037
13038
2.20k
    return out;
13039
2.20k
}
13040
13041
/*[clinic input]
13042
str.partition as unicode_partition
13043
13044
    sep: object
13045
    /
13046
13047
Partition the string into three parts using the given separator.
13048
13049
This will search for the separator in the string.  If the separator is found,
13050
returns a 3-tuple containing the part before the separator, the separator
13051
itself, and the part after it.
13052
13053
If the separator is not found, returns a 3-tuple containing the original string
13054
and two empty strings.
13055
[clinic start generated code]*/
13056
13057
static PyObject *
13058
unicode_partition(PyObject *self, PyObject *sep)
13059
/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
13060
0
{
13061
0
    return PyUnicode_Partition(self, sep);
13062
0
}
13063
13064
/*[clinic input]
13065
str.rpartition as unicode_rpartition = str.partition
13066
13067
Partition the string into three parts using the given separator.
13068
13069
This will search for the separator in the string, starting at the end. If
13070
the separator is found, returns a 3-tuple containing the part before the
13071
separator, the separator itself, and the part after it.
13072
13073
If the separator is not found, returns a 3-tuple containing two empty strings
13074
and the original string.
13075
[clinic start generated code]*/
13076
13077
static PyObject *
13078
unicode_rpartition(PyObject *self, PyObject *sep)
13079
/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
13080
2.20k
{
13081
2.20k
    return PyUnicode_RPartition(self, sep);
13082
2.20k
}
13083
13084
PyObject *
13085
PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
13086
0
{
13087
0
    if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
13088
0
        return NULL;
13089
13090
0
    return rsplit(s, sep, maxsplit);
13091
0
}
13092
13093
/*[clinic input]
13094
str.rsplit as unicode_rsplit = str.split
13095
13096
Return a list of the words in the string, using sep as the delimiter string.
13097
13098
Splits are done starting at the end of the string and working to the front.
13099
[clinic start generated code]*/
13100
13101
static PyObject *
13102
unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13103
/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
13104
0
{
13105
0
    if (sep == Py_None)
13106
0
        return rsplit(self, NULL, maxsplit);
13107
0
    if (PyUnicode_Check(sep))
13108
0
        return rsplit(self, sep, maxsplit);
13109
13110
0
    PyErr_Format(PyExc_TypeError,
13111
0
                 "must be str or None, not %.100s",
13112
0
                 Py_TYPE(sep)->tp_name);
13113
0
    return NULL;
13114
0
}
13115
13116
/*[clinic input]
13117
str.splitlines as unicode_splitlines
13118
13119
    keepends: bool(accept={int}) = False
13120
13121
Return a list of the lines in the string, breaking at line boundaries.
13122
13123
Line breaks are not included in the resulting list unless keepends is given and
13124
true.
13125
[clinic start generated code]*/
13126
13127
static PyObject *
13128
unicode_splitlines_impl(PyObject *self, int keepends)
13129
/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
13130
0
{
13131
0
    return PyUnicode_Splitlines(self, keepends);
13132
0
}
13133
13134
static
13135
PyObject *unicode_str(PyObject *self)
13136
0
{
13137
0
    return unicode_result_unchanged(self);
13138
0
}
13139
13140
/*[clinic input]
13141
str.swapcase as unicode_swapcase
13142
13143
Convert uppercase characters to lowercase and lowercase characters to uppercase.
13144
[clinic start generated code]*/
13145
13146
static PyObject *
13147
unicode_swapcase_impl(PyObject *self)
13148
/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
13149
0
{
13150
0
    if (PyUnicode_READY(self) == -1)
13151
0
        return NULL;
13152
0
    return case_operation(self, do_swapcase);
13153
0
}
13154
13155
/*[clinic input]
13156
13157
@staticmethod
13158
str.maketrans as unicode_maketrans
13159
13160
  x: object
13161
13162
  y: unicode=NULL
13163
13164
  z: unicode=NULL
13165
13166
  /
13167
13168
Return a translation table usable for str.translate().
13169
13170
If there is only one argument, it must be a dictionary mapping Unicode
13171
ordinals (integers) or characters to Unicode ordinals, strings or None.
13172
Character keys will be then converted to ordinals.
13173
If there are two arguments, they must be strings of equal length, and
13174
in the resulting dictionary, each character in x will be mapped to the
13175
character at the same position in y. If there is a third argument, it
13176
must be a string, whose characters will be mapped to None in the result.
13177
[clinic start generated code]*/
13178
13179
static PyObject *
13180
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
13181
/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
13182
0
{
13183
0
    PyObject *new = NULL, *key, *value;
13184
0
    Py_ssize_t i = 0;
13185
0
    int res;
13186
13187
0
    new = PyDict_New();
13188
0
    if (!new)
13189
0
        return NULL;
13190
0
    if (y != NULL) {
13191
0
        int x_kind, y_kind, z_kind;
13192
0
        void *x_data, *y_data, *z_data;
13193
13194
        /* x must be a string too, of equal length */
13195
0
        if (!PyUnicode_Check(x)) {
13196
0
            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13197
0
                            "be a string if there is a second argument");
13198
0
            goto err;
13199
0
        }
13200
0
        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13201
0
            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13202
0
                            "arguments must have equal length");
13203
0
            goto err;
13204
0
        }
13205
        /* create entries for translating chars in x to those in y */
13206
0
        x_kind = PyUnicode_KIND(x);
13207
0
        y_kind = PyUnicode_KIND(y);
13208
0
        x_data = PyUnicode_DATA(x);
13209
0
        y_data = PyUnicode_DATA(y);
13210
0
        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13211
0
            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13212
0
            if (!key)
13213
0
                goto err;
13214
0
            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13215
0
            if (!value) {
13216
0
                Py_DECREF(key);
13217
0
                goto err;
13218
0
            }
13219
0
            res = PyDict_SetItem(new, key, value);
13220
0
            Py_DECREF(key);
13221
0
            Py_DECREF(value);
13222
0
            if (res < 0)
13223
0
                goto err;
13224
0
        }
13225
        /* create entries for deleting chars in z */
13226
0
        if (z != NULL) {
13227
0
            z_kind = PyUnicode_KIND(z);
13228
0
            z_data = PyUnicode_DATA(z);
13229
0
            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13230
0
                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13231
0
                if (!key)
13232
0
                    goto err;
13233
0
                res = PyDict_SetItem(new, key, Py_None);
13234
0
                Py_DECREF(key);
13235
0
                if (res < 0)
13236
0
                    goto err;
13237
0
            }
13238
0
        }
13239
0
    } else {
13240
0
        int kind;
13241
0
        void *data;
13242
13243
        /* x must be a dict */
13244
0
        if (!PyDict_CheckExact(x)) {
13245
0
            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13246
0
                            "to maketrans it must be a dict");
13247
0
            goto err;
13248
0
        }
13249
        /* copy entries into the new dict, converting string keys to int keys */
13250
0
        while (PyDict_Next(x, &i, &key, &value)) {
13251
0
            if (PyUnicode_Check(key)) {
13252
                /* convert string keys to integer keys */
13253
0
                PyObject *newkey;
13254
0
                if (PyUnicode_GET_LENGTH(key) != 1) {
13255
0
                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13256
0
                                    "table must be of length 1");
13257
0
                    goto err;
13258
0
                }
13259
0
                kind = PyUnicode_KIND(key);
13260
0
                data = PyUnicode_DATA(key);
13261
0
                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13262
0
                if (!newkey)
13263
0
                    goto err;
13264
0
                res = PyDict_SetItem(new, newkey, value);
13265
0
                Py_DECREF(newkey);
13266
0
                if (res < 0)
13267
0
                    goto err;
13268
0
            } else if (PyLong_Check(key)) {
13269
                /* just keep integer keys */
13270
0
                if (PyDict_SetItem(new, key, value) < 0)
13271
0
                    goto err;
13272
0
            } else {
13273
0
                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13274
0
                                "be strings or integers");
13275
0
                goto err;
13276
0
            }
13277
0
        }
13278
0
    }
13279
0
    return new;
13280
0
  err:
13281
0
    Py_DECREF(new);
13282
0
    return NULL;
13283
0
}
13284
13285
/*[clinic input]
13286
str.translate as unicode_translate
13287
13288
    table: object
13289
        Translation table, which must be a mapping of Unicode ordinals to
13290
        Unicode ordinals, strings, or None.
13291
    /
13292
13293
Replace each character in the string using the given translation table.
13294
13295
The table must implement lookup/indexing via __getitem__, for instance a
13296
dictionary or list.  If this operation raises LookupError, the character is
13297
left untouched.  Characters mapped to None are deleted.
13298
[clinic start generated code]*/
13299
13300
static PyObject *
13301
unicode_translate(PyObject *self, PyObject *table)
13302
/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
13303
48
{
13304
48
    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13305
48
}
13306
13307
/*[clinic input]
13308
str.upper as unicode_upper
13309
13310
Return a copy of the string converted to uppercase.
13311
[clinic start generated code]*/
13312
13313
static PyObject *
13314
unicode_upper_impl(PyObject *self)
13315
/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
13316
36
{
13317
36
    if (PyUnicode_READY(self) == -1)
13318
0
        return NULL;
13319
36
    if (PyUnicode_IS_ASCII(self))
13320
36
        return ascii_upper_or_lower(self, 0);
13321
0
    return case_operation(self, do_upper);
13322
36
}
13323
13324
/*[clinic input]
13325
str.zfill as unicode_zfill
13326
13327
    width: Py_ssize_t
13328
    /
13329
13330
Pad a numeric string with zeros on the left, to fill a field of the given width.
13331
13332
The string is never truncated.
13333
[clinic start generated code]*/
13334
13335
static PyObject *
13336
unicode_zfill_impl(PyObject *self, Py_ssize_t width)
13337
/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
13338
0
{
13339
0
    Py_ssize_t fill;
13340
0
    PyObject *u;
13341
0
    int kind;
13342
0
    void *data;
13343
0
    Py_UCS4 chr;
13344
13345
0
    if (PyUnicode_READY(self) == -1)
13346
0
        return NULL;
13347
13348
0
    if (PyUnicode_GET_LENGTH(self) >= width)
13349
0
        return unicode_result_unchanged(self);
13350
13351
0
    fill = width - PyUnicode_GET_LENGTH(self);
13352
13353
0
    u = pad(self, fill, 0, '0');
13354
13355
0
    if (u == NULL)
13356
0
        return NULL;
13357
13358
0
    kind = PyUnicode_KIND(u);
13359
0
    data = PyUnicode_DATA(u);
13360
0
    chr = PyUnicode_READ(kind, data, fill);
13361
13362
0
    if (chr == '+' || chr == '-') {
13363
        /* move sign to beginning of string */
13364
0
        PyUnicode_WRITE(kind, data, 0, chr);
13365
0
        PyUnicode_WRITE(kind, data, fill, '0');
13366
0
    }
13367
13368
0
    assert(_PyUnicode_CheckConsistency(u, 1));
13369
0
    return u;
13370
0
}
13371
13372
#if 0
13373
static PyObject *
13374
unicode__decimal2ascii(PyObject *self)
13375
{
13376
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
13377
}
13378
#endif
13379
13380
PyDoc_STRVAR(startswith__doc__,
13381
             "S.startswith(prefix[, start[, end]]) -> bool\n\
13382
\n\
13383
Return True if S starts with the specified prefix, False otherwise.\n\
13384
With optional start, test S beginning at that position.\n\
13385
With optional end, stop comparing S at that position.\n\
13386
prefix can also be a tuple of strings to try.");
13387
13388
static PyObject *
13389
unicode_startswith(PyObject *self,
13390
                   PyObject *args)
13391
526
{
13392
526
    PyObject *subobj;
13393
526
    PyObject *substring;
13394
526
    Py_ssize_t start = 0;
13395
526
    Py_ssize_t end = PY_SSIZE_T_MAX;
13396
526
    int result;
13397
13398
526
    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13399
0
        return NULL;
13400
526
    if (PyTuple_Check(subobj)) {
13401
43
        Py_ssize_t i;
13402
172
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13403
129
            substring = PyTuple_GET_ITEM(subobj, i);
13404
129
            if (!PyUnicode_Check(substring)) {
13405
0
                PyErr_Format(PyExc_TypeError,
13406
0
                             "tuple for startswith must only contain str, "
13407
0
                             "not %.100s",
13408
0
                             Py_TYPE(substring)->tp_name);
13409
0
                return NULL;
13410
0
            }
13411
129
            result = tailmatch(self, substring, start, end, -1);
13412
129
            if (result == -1)
13413
0
                return NULL;
13414
129
            if (result) {
13415
0
                Py_RETURN_TRUE;
13416
0
            }
13417
129
        }
13418
        /* nothing matched */
13419
43
        Py_RETURN_FALSE;
13420
43
    }
13421
483
    if (!PyUnicode_Check(subobj)) {
13422
0
        PyErr_Format(PyExc_TypeError,
13423
0
                     "startswith first arg must be str or "
13424
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13425
0
        return NULL;
13426
0
    }
13427
483
    result = tailmatch(self, subobj, start, end, -1);
13428
483
    if (result == -1)
13429
0
        return NULL;
13430
483
    return PyBool_FromLong(result);
13431
483
}
13432
13433
13434
PyDoc_STRVAR(endswith__doc__,
13435
             "S.endswith(suffix[, start[, end]]) -> bool\n\
13436
\n\
13437
Return True if S ends with the specified suffix, False otherwise.\n\
13438
With optional start, test S beginning at that position.\n\
13439
With optional end, stop comparing S at that position.\n\
13440
suffix can also be a tuple of strings to try.");
13441
13442
static PyObject *
13443
unicode_endswith(PyObject *self,
13444
                 PyObject *args)
13445
404
{
13446
404
    PyObject *subobj;
13447
404
    PyObject *substring;
13448
404
    Py_ssize_t start = 0;
13449
404
    Py_ssize_t end = PY_SSIZE_T_MAX;
13450
404
    int result;
13451
13452
404
    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13453
0
        return NULL;
13454
404
    if (PyTuple_Check(subobj)) {
13455
235
        Py_ssize_t i;
13456
235
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13457
235
            substring = PyTuple_GET_ITEM(subobj, i);
13458
235
            if (!PyUnicode_Check(substring)) {
13459
0
                PyErr_Format(PyExc_TypeError,
13460
0
                             "tuple for endswith must only contain str, "
13461
0
                             "not %.100s",
13462
0
                             Py_TYPE(substring)->tp_name);
13463
0
                return NULL;
13464
0
            }
13465
235
            result = tailmatch(self, substring, start, end, +1);
13466
235
            if (result == -1)
13467
0
                return NULL;
13468
235
            if (result) {
13469
235
                Py_RETURN_TRUE;
13470
235
            }
13471
235
        }
13472
235
        Py_RETURN_FALSE;
13473
235
    }
13474
169
    if (!PyUnicode_Check(subobj)) {
13475
0
        PyErr_Format(PyExc_TypeError,
13476
0
                     "endswith first arg must be str or "
13477
0
                     "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
13478
0
        return NULL;
13479
0
    }
13480
169
    result = tailmatch(self, subobj, start, end, +1);
13481
169
    if (result == -1)
13482
0
        return NULL;
13483
169
    return PyBool_FromLong(result);
13484
169
}
13485
13486
static inline void
13487
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13488
70.4k
{
13489
70.4k
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13490
70.4k
    writer->data = PyUnicode_DATA(writer->buffer);
13491
13492
70.4k
    if (!writer->readonly) {
13493
70.4k
        writer->kind = PyUnicode_KIND(writer->buffer);
13494
70.4k
        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13495
70.4k
    }
13496
0
    else {
13497
        /* use a value smaller than PyUnicode_1BYTE_KIND() so
13498
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13499
0
        writer->kind = PyUnicode_WCHAR_KIND;
13500
0
        assert(writer->kind <= PyUnicode_1BYTE_KIND);
13501
13502
        /* Copy-on-write mode: set buffer size to 0 so
13503
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13504
         * next write. */
13505
0
        writer->size = 0;
13506
0
    }
13507
70.4k
}
13508
13509
void
13510
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13511
70.3k
{
13512
70.3k
    memset(writer, 0, sizeof(*writer));
13513
13514
    /* ASCII is the bare minimum */
13515
70.3k
    writer->min_char = 127;
13516
13517
    /* use a value smaller than PyUnicode_1BYTE_KIND() so
13518
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13519
70.3k
    writer->kind = PyUnicode_WCHAR_KIND;
13520
70.3k
    assert(writer->kind <= PyUnicode_1BYTE_KIND);
13521
70.3k
}
13522
13523
int
13524
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13525
                                 Py_ssize_t length, Py_UCS4 maxchar)
13526
70.4k
{
13527
70.4k
    Py_ssize_t newlen;
13528
70.4k
    PyObject *newbuffer;
13529
13530
70.4k
    assert(maxchar <= MAX_UNICODE);
13531
13532
    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13533
70.4k
    assert((maxchar > writer->maxchar && length >= 0)
13534
70.4k
           || length > 0);
13535
13536
70.4k
    if (length > PY_SSIZE_T_MAX - writer->pos) {
13537
0
        PyErr_NoMemory();
13538
0
        return -1;
13539
0
    }
13540
70.4k
    newlen = writer->pos + length;
13541
13542
70.4k
    maxchar = Py_MAX(maxchar, writer->min_char);
13543
13544
70.4k
    if (writer->buffer == NULL) {
13545
70.3k
        assert(!writer->readonly);
13546
70.3k
        if (writer->overallocate
13547
70.3k
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13548
            /* overallocate to limit the number of realloc() */
13549
6.27k
            newlen += newlen / OVERALLOCATE_FACTOR;
13550
6.27k
        }
13551
70.3k
        if (newlen < writer->min_length)
13552
6.27k
            newlen = writer->min_length;
13553
13554
70.3k
        writer->buffer = PyUnicode_New(newlen, maxchar);
13555
70.3k
        if (writer->buffer == NULL)
13556
0
            return -1;
13557
70.3k
    }
13558
63
    else if (newlen > writer->size) {
13559
34
        if (writer->overallocate
13560
34
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13561
            /* overallocate to limit the number of realloc() */
13562
6
            newlen += newlen / OVERALLOCATE_FACTOR;
13563
6
        }
13564
34
        if (newlen < writer->min_length)
13565
0
            newlen = writer->min_length;
13566
13567
34
        if (maxchar > writer->maxchar || writer->readonly) {
13568
            /* resize + widen */
13569
0
            maxchar = Py_MAX(maxchar, writer->maxchar);
13570
0
            newbuffer = PyUnicode_New(newlen, maxchar);
13571
0
            if (newbuffer == NULL)
13572
0
                return -1;
13573
0
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13574
0
                                          writer->buffer, 0, writer->pos);
13575
0
            Py_DECREF(writer->buffer);
13576
0
            writer->readonly = 0;
13577
0
        }
13578
34
        else {
13579
34
            newbuffer = resize_compact(writer->buffer, newlen);
13580
34
            if (newbuffer == NULL)
13581
0
                return -1;
13582
34
        }
13583
34
        writer->buffer = newbuffer;
13584
34
    }
13585
29
    else if (maxchar > writer->maxchar) {
13586
29
        assert(!writer->readonly);
13587
29
        newbuffer = PyUnicode_New(writer->size, maxchar);
13588
29
        if (newbuffer == NULL)
13589
0
            return -1;
13590
29
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13591
29
                                      writer->buffer, 0, writer->pos);
13592
29
        Py_SETREF(writer->buffer, newbuffer);
13593
29
    }
13594
70.4k
    _PyUnicodeWriter_Update(writer);
13595
70.4k
    return 0;
13596
13597
70.4k
#undef OVERALLOCATE_FACTOR
13598
70.4k
}
13599
13600
int
13601
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13602
                                     enum PyUnicode_Kind kind)
13603
0
{
13604
0
    Py_UCS4 maxchar;
13605
13606
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13607
0
    assert(writer->kind < kind);
13608
13609
0
    switch (kind)
13610
0
    {
13611
0
    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13612
0
    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13613
0
    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13614
0
    default:
13615
0
        Py_UNREACHABLE();
13616
0
    }
13617
13618
0
    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13619
0
}
13620
13621
static inline int
13622
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13623
85
{
13624
85
    assert(ch <= MAX_UNICODE);
13625
85
    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13626
0
        return -1;
13627
85
    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13628
85
    writer->pos++;
13629
85
    return 0;
13630
85
}
13631
13632
int
13633
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13634
4
{
13635
4
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13636
4
}
13637
13638
int
13639
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13640
12.0k
{
13641
12.0k
    Py_UCS4 maxchar;
13642
12.0k
    Py_ssize_t len;
13643
13644
12.0k
    if (PyUnicode_READY(str) == -1)
13645
0
        return -1;
13646
12.0k
    len = PyUnicode_GET_LENGTH(str);
13647
12.0k
    if (len == 0)
13648
260
        return 0;
13649
11.8k
    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13650
11.8k
    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13651
341
        if (writer->buffer == NULL && !writer->overallocate) {
13652
0
            assert(_PyUnicode_CheckConsistency(str, 1));
13653
0
            writer->readonly = 1;
13654
0
            Py_INCREF(str);
13655
0
            writer->buffer = str;
13656
0
            _PyUnicodeWriter_Update(writer);
13657
0
            writer->pos += len;
13658
0
            return 0;
13659
0
        }
13660
341
        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13661
0
            return -1;
13662
341
    }
13663
11.8k
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13664
11.8k
                                  str, 0, len);
13665
11.8k
    writer->pos += len;
13666
11.8k
    return 0;
13667
11.8k
}
13668
13669
int
13670
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13671
                                Py_ssize_t start, Py_ssize_t end)
13672
334
{
13673
334
    Py_UCS4 maxchar;
13674
334
    Py_ssize_t len;
13675
13676
334
    if (PyUnicode_READY(str) == -1)
13677
0
        return -1;
13678
13679
334
    assert(0 <= start);
13680
334
    assert(end <= PyUnicode_GET_LENGTH(str));
13681
334
    assert(start <= end);
13682
13683
334
    if (end == 0)
13684
0
        return 0;
13685
13686
334
    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13687
0
        return _PyUnicodeWriter_WriteStr(writer, str);
13688
13689
334
    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13690
120
        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13691
214
    else
13692
214
        maxchar = writer->maxchar;
13693
334
    len = end - start;
13694
13695
334
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13696
0
        return -1;
13697
13698
334
    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13699
334
                                  str, start, len);
13700
334
    writer->pos += len;
13701
334
    return 0;
13702
334
}
13703
13704
int
13705
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13706
                                  const char *ascii, Py_ssize_t len)
13707
18.3k
{
13708
18.3k
    if (len == -1)
13709
0
        len = strlen(ascii);
13710
13711
18.3k
    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
13712
13713
18.3k
    if (writer->buffer == NULL && !writer->overallocate) {
13714
0
        PyObject *str;
13715
13716
0
        str = _PyUnicode_FromASCII(ascii, len);
13717
0
        if (str == NULL)
13718
0
            return -1;
13719
13720
0
        writer->readonly = 1;
13721
0
        writer->buffer = str;
13722
0
        _PyUnicodeWriter_Update(writer);
13723
0
        writer->pos += len;
13724
0
        return 0;
13725
0
    }
13726
13727
18.3k
    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13728
0
        return -1;
13729
13730
18.3k
    switch (writer->kind)
13731
18.3k
    {
13732
18.3k
    case PyUnicode_1BYTE_KIND:
13733
18.3k
    {
13734
18.3k
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13735
18.3k
        Py_UCS1 *data = writer->data;
13736
13737
18.3k
        memcpy(data + writer->pos, str, len);
13738
18.3k
        break;
13739
0
    }
13740
0
    case PyUnicode_2BYTE_KIND:
13741
0
    {
13742
0
        _PyUnicode_CONVERT_BYTES(
13743
0
            Py_UCS1, Py_UCS2,
13744
0
            ascii, ascii + len,
13745
0
            (Py_UCS2 *)writer->data + writer->pos);
13746
0
        break;
13747
0
    }
13748
0
    case PyUnicode_4BYTE_KIND:
13749
0
    {
13750
0
        _PyUnicode_CONVERT_BYTES(
13751
0
            Py_UCS1, Py_UCS4,
13752
0
            ascii, ascii + len,
13753
0
            (Py_UCS4 *)writer->data + writer->pos);
13754
0
        break;
13755
0
    }
13756
0
    default:
13757
0
        Py_UNREACHABLE();
13758
18.3k
    }
13759
13760
18.3k
    writer->pos += len;
13761
18.3k
    return 0;
13762
18.3k
}
13763
13764
int
13765
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13766
                                   const char *str, Py_ssize_t len)
13767
0
{
13768
0
    Py_UCS4 maxchar;
13769
13770
0
    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
13771
0
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13772
0
        return -1;
13773
0
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13774
0
    writer->pos += len;
13775
0
    return 0;
13776
0
}
13777
13778
PyObject *
13779
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13780
70.3k
{
13781
70.3k
    PyObject *str;
13782
13783
70.3k
    if (writer->pos == 0) {
13784
0
        Py_CLEAR(writer->buffer);
13785
0
        _Py_RETURN_UNICODE_EMPTY();
13786
0
    }
13787
13788
70.3k
    str = writer->buffer;
13789
70.3k
    writer->buffer = NULL;
13790
13791
70.3k
    if (writer->readonly) {
13792
0
        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13793
0
        return str;
13794
0
    }
13795
13796
70.3k
    if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13797
6.25k
        PyObject *str2;
13798
6.25k
        str2 = resize_compact(str, writer->pos);
13799
6.25k
        if (str2 == NULL) {
13800
0
            Py_DECREF(str);
13801
0
            return NULL;
13802
0
        }
13803
6.25k
        str = str2;
13804
6.25k
    }
13805
13806
70.3k
    assert(_PyUnicode_CheckConsistency(str, 1));
13807
70.3k
    return unicode_result_ready(str);
13808
70.3k
}
13809
13810
void
13811
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13812
36
{
13813
36
    Py_CLEAR(writer->buffer);
13814
36
}
13815
13816
#include "stringlib/unicode_format.h"
13817
13818
PyDoc_STRVAR(format__doc__,
13819
             "S.format(*args, **kwargs) -> str\n\
13820
\n\
13821
Return a formatted version of S, using substitutions from args and kwargs.\n\
13822
The substitutions are identified by braces ('{' and '}').");
13823
13824
PyDoc_STRVAR(format_map__doc__,
13825
             "S.format_map(mapping) -> str\n\
13826
\n\
13827
Return a formatted version of S, using substitutions from mapping.\n\
13828
The substitutions are identified by braces ('{' and '}').");
13829
13830
/*[clinic input]
13831
str.__format__ as unicode___format__
13832
13833
    format_spec: unicode
13834
    /
13835
13836
Return a formatted version of the string as described by format_spec.
13837
[clinic start generated code]*/
13838
13839
static PyObject *
unicode___format___impl(PyObject *self, PyObject *format_spec)
/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
{
    /* Format self according to the whole of format_spec, collecting the
       output in a temporary writer. Return NULL on error. */
    _PyUnicodeWriter writer;
    Py_ssize_t spec_end;

    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }

    _PyUnicodeWriter_Init(&writer);
    spec_end = PyUnicode_GET_LENGTH(format_spec);
    if (_PyUnicode_FormatAdvancedWriter(&writer, self, format_spec,
                                        0, spec_end) == -1)
    {
        _PyUnicodeWriter_Dealloc(&writer);
        return NULL;
    }
    return _PyUnicodeWriter_Finish(&writer);
}
13858
13859
/*[clinic input]
13860
str.__sizeof__ as unicode_sizeof
13861
13862
Return the size of the string in memory, in bytes.
13863
[clinic start generated code]*/
13864
13865
static PyObject *
unicode_sizeof_impl(PyObject *self)
/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
{
    Py_ssize_t nbytes;

    if (PyUnicode_IS_COMPACT_ASCII(self)) {
        /* Compact ASCII: header immediately followed by one byte per
           character plus one extra. */
        nbytes = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
    }
    else if (PyUnicode_IS_COMPACT(self)) {
        /* Compact non-ASCII: larger header, KIND bytes per character. */
        nbytes = sizeof(PyCompactUnicodeObject) +
            (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
    }
    else {
        /* Two-block object: the base object, plus the separate character
           block when one is present. */
        nbytes = sizeof(PyUnicodeObject);
        if (_PyUnicode_DATA_ANY(self)) {
            nbytes += (PyUnicode_GET_LENGTH(self) + 1) *
                PyUnicode_KIND(self);
        }
    }

    /* Count the wstr and UTF-8 representations only when they own their
       memory, i.e. when they are not shared with the data pointer. */
    if (_PyUnicode_HAS_WSTR_MEMORY(self)) {
        nbytes += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
    }
    if (_PyUnicode_HAS_UTF8_MEMORY(self)) {
        nbytes += PyUnicode_UTF8_LENGTH(self) + 1;
    }

    return PyLong_FromSsize_t(nbytes);
}
13895
13896
/* __getnewargs__: return a 1-tuple holding a copy of v made with
   _PyUnicode_Copy(), or NULL if the copy fails. */
static PyObject *
unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
{
    PyObject *duplicate = _PyUnicode_Copy(v);

    if (duplicate == NULL) {
        return NULL;
    }
    /* "N" hands the reference to the new tuple. */
    return Py_BuildValue("(N)", duplicate);
}
13904
13905
static PyMethodDef unicode_methods[] = {
13906
    UNICODE_ENCODE_METHODDEF
13907
    UNICODE_REPLACE_METHODDEF
13908
    UNICODE_SPLIT_METHODDEF
13909
    UNICODE_RSPLIT_METHODDEF
13910
    UNICODE_JOIN_METHODDEF
13911
    UNICODE_CAPITALIZE_METHODDEF
13912
    UNICODE_CASEFOLD_METHODDEF
13913
    UNICODE_TITLE_METHODDEF
13914
    UNICODE_CENTER_METHODDEF
13915
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13916
    UNICODE_EXPANDTABS_METHODDEF
13917
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13918
    UNICODE_PARTITION_METHODDEF
13919
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13920
    UNICODE_LJUST_METHODDEF
13921
    UNICODE_LOWER_METHODDEF
13922
    UNICODE_LSTRIP_METHODDEF
13923
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13924
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13925
    UNICODE_RJUST_METHODDEF
13926
    UNICODE_RSTRIP_METHODDEF
13927
    UNICODE_RPARTITION_METHODDEF
13928
    UNICODE_SPLITLINES_METHODDEF
13929
    UNICODE_STRIP_METHODDEF
13930
    UNICODE_SWAPCASE_METHODDEF
13931
    UNICODE_TRANSLATE_METHODDEF
13932
    UNICODE_UPPER_METHODDEF
13933
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13934
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13935
    UNICODE_ISASCII_METHODDEF
13936
    UNICODE_ISLOWER_METHODDEF
13937
    UNICODE_ISUPPER_METHODDEF
13938
    UNICODE_ISTITLE_METHODDEF
13939
    UNICODE_ISSPACE_METHODDEF
13940
    UNICODE_ISDECIMAL_METHODDEF
13941
    UNICODE_ISDIGIT_METHODDEF
13942
    UNICODE_ISNUMERIC_METHODDEF
13943
    UNICODE_ISALPHA_METHODDEF
13944
    UNICODE_ISALNUM_METHODDEF
13945
    UNICODE_ISIDENTIFIER_METHODDEF
13946
    UNICODE_ISPRINTABLE_METHODDEF
13947
    UNICODE_ZFILL_METHODDEF
13948
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13949
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13950
    UNICODE___FORMAT___METHODDEF
13951
    UNICODE_MAKETRANS_METHODDEF
13952
    UNICODE_SIZEOF_METHODDEF
13953
#if 0
13954
    /* These methods are just used for debugging the implementation. */
13955
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13956
#endif
13957
13958
    {"__getnewargs__",  unicode_getnewargs, METH_NOARGS},
13959
    {NULL, NULL}
13960
};
13961
13962
/* nb_remainder slot: "v % w" formatting. Only handles the case where the
   left operand is a str; otherwise NotImplemented is returned. */
static PyObject *
unicode_mod(PyObject *v, PyObject *w)
{
    if (PyUnicode_Check(v)) {
        return PyUnicode_Format(v, w);
    }
    Py_RETURN_NOTIMPLEMENTED;
}
13969
13970
static PyNumberMethods unicode_as_number = {
13971
    0,              /*nb_add*/
13972
    0,              /*nb_subtract*/
13973
    0,              /*nb_multiply*/
13974
    unicode_mod,            /*nb_remainder*/
13975
};
13976
13977
static PySequenceMethods unicode_as_sequence = {
13978
    (lenfunc) unicode_length,       /* sq_length */
13979
    PyUnicode_Concat,           /* sq_concat */
13980
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13981
    (ssizeargfunc) unicode_getitem,     /* sq_item */
13982
    0,                  /* sq_slice */
13983
    0,                  /* sq_ass_item */
13984
    0,                  /* sq_ass_slice */
13985
    PyUnicode_Contains,         /* sq_contains */
13986
};
13987
13988
/* mp_subscript slot: self[item].

   item may be an index object (negative values count from the end) or a
   slice. Any other type raises TypeError. Slices with step 1 are delegated
   to PyUnicode_Substring(); other steps build a new string character by
   character. Return NULL with an exception set on error. */
static PyObject*
unicode_subscript(PyObject* self, PyObject* item)
{
    if (PyUnicode_READY(self) == -1) {
        return NULL;
    }

    /* Integer-like index. */
    if (PyIndex_Check(item)) {
        Py_ssize_t index = PyNumber_AsSsize_t(item, PyExc_IndexError);
        if (index == -1 && PyErr_Occurred()) {
            return NULL;
        }
        if (index < 0) {
            index += PyUnicode_GET_LENGTH(self);
        }
        return unicode_getitem(self, index);
    }

    if (!PySlice_Check(item)) {
        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
        return NULL;
    }

    /* Slice. */
    Py_ssize_t start, stop, step;
    if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
        return NULL;
    }
    Py_ssize_t count = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
                                             &start, &stop, step);

    if (count <= 0) {
        _Py_RETURN_UNICODE_EMPTY();
    }
    if (step == 1) {
        if (start == 0 && count == PyUnicode_GET_LENGTH(self)) {
            /* The slice covers the whole string. */
            return unicode_result_unchanged(self);
        }
        return PyUnicode_Substring(self, start, start + count);
    }

    /* General case: step != 1. */
    int src_kind = PyUnicode_KIND(self);
    void *src_data = PyUnicode_DATA(self);
    Py_UCS4 max_char;
    size_t pos;
    Py_ssize_t n;

    if (PyUnicode_IS_ASCII(self)) {
        max_char = 127;
    }
    else {
        /* Find the largest selected character; stop scanning once the
           limit returned by kind_maxchar_limit() has been reached. */
        Py_UCS4 kind_limit = kind_maxchar_limit(src_kind);
        max_char = 0;
        pos = start;
        for (n = 0; n < count; n++, pos += step) {
            Py_UCS4 seen = PyUnicode_READ(src_kind, src_data, pos);
            if (seen > max_char) {
                max_char = seen;
                if (max_char >= kind_limit) {
                    break;
                }
            }
        }
    }

    PyObject *result = PyUnicode_New(count, max_char);
    if (result == NULL) {
        return NULL;
    }
    int dest_kind = PyUnicode_KIND(result);
    void *dest_data = PyUnicode_DATA(result);

    pos = start;
    for (n = 0; n < count; n++, pos += step) {
        Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, pos);
        PyUnicode_WRITE(dest_kind, dest_data, n, ch);
    }
    assert(_PyUnicode_CheckConsistency(result, 1));
    return result;
}
14058
14059
static PyMappingMethods unicode_as_mapping = {
14060
    (lenfunc)unicode_length,        /* mp_length */
14061
    (binaryfunc)unicode_subscript,  /* mp_subscript */
14062
    (objobjargproc)0,           /* mp_ass_subscript */
14063
};
14064
14065
14066
/* Helpers for PyUnicode_Format() */
14067
14068
struct unicode_formatter_t {
14069
    PyObject *args;
14070
    int args_owned;
14071
    Py_ssize_t arglen, argidx;
14072
    PyObject *dict;
14073
14074
    enum PyUnicode_Kind fmtkind;
14075
    Py_ssize_t fmtcnt, fmtpos;
14076
    void *fmtdata;
14077
    PyObject *fmtstr;
14078
14079
    _PyUnicodeWriter writer;
14080
};
14081
14082
struct unicode_format_arg_t {
14083
    Py_UCS4 ch;
14084
    int flags;
14085
    Py_ssize_t width;
14086
    int prec;
14087
    int sign;
14088
};
14089
14090
static PyObject *
14091
unicode_format_getnextarg(struct unicode_formatter_t *ctx)
14092
116
{
14093
116
    Py_ssize_t argidx = ctx->argidx;
14094
14095
116
    if (argidx < ctx->arglen) {
14096
116
        ctx->argidx++;
14097
116
        if (ctx->arglen < 0)
14098
46
            return ctx->args;
14099
70
        else
14100
70
            return PyTuple_GetItem(ctx->args, argidx);
14101
116
    }
14102
0
    PyErr_SetString(PyExc_TypeError,
14103
0
                    "not enough arguments for format string");
14104
0
    return NULL;
14105
116
}
14106
14107
/* Returns a new reference to a PyUnicode object, or NULL on failure. */
14108
14109
/* Format a float into the writer if the writer is not NULL, or into *p_output
14110
   otherwise.
14111
14112
   Return 0 on success, raise an exception and return -1 on error. */
14113
static int
14114
formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14115
            PyObject **p_output,
14116
            _PyUnicodeWriter *writer)
14117
0
{
14118
0
    char *p;
14119
0
    double x;
14120
0
    Py_ssize_t len;
14121
0
    int prec;
14122
0
    int dtoa_flags;
14123
14124
0
    x = PyFloat_AsDouble(v);
14125
0
    if (x == -1.0 && PyErr_Occurred())
14126
0
        return -1;
14127
14128
0
    prec = arg->prec;
14129
0
    if (prec < 0)
14130
0
        prec = 6;
14131
14132
0
    if (arg->flags & F_ALT)
14133
0
        dtoa_flags = Py_DTSF_ALT;
14134
0
    else
14135
0
        dtoa_flags = 0;
14136
0
    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
14137
0
    if (p == NULL)
14138
0
        return -1;
14139
0
    len = strlen(p);
14140
0
    if (writer) {
14141
0
        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
14142
0
            PyMem_Free(p);
14143
0
            return -1;
14144
0
        }
14145
0
    }
14146
0
    else
14147
0
        *p_output = _PyUnicode_FromASCII(p, len);
14148
0
    PyMem_Free(p);
14149
0
    return 0;
14150
0
}
14151
14152
/* formatlong() emulates the format codes d, u, o, x and X, and
14153
 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
14154
 * Python's regular ints.
14155
 * Return value:  a new PyUnicodeObject*, or NULL if error.
14156
 *     The output string is of the form
14157
 *         "-"? ("0x" | "0X")? digit+
14158
 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
14159
 *         set in flags.  The case of hex digits will be correct,
14160
 *     There will be at least prec digits, zero-filled on the left if
14161
 *         necessary to get that many.
14162
 * val          object to be converted
14163
 * flags        bitmask of format flags; only F_ALT is looked at
14164
 * prec         minimum number of digits; 0-fill on left if needed
14165
 * type         a character in [duoxX]; u acts the same as d
14166
 *
14167
 * CAUTION:  o, x and X conversions on regular ints can never
14168
 * produce a '-' sign, but can for Python's unbounded ints.
14169
 */
14170
PyObject *
14171
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
14172
0
{
14173
0
    PyObject *result = NULL;
14174
0
    char *buf;
14175
0
    Py_ssize_t i;
14176
0
    int sign;           /* 1 if '-', else 0 */
14177
0
    int len;            /* number of characters */
14178
0
    Py_ssize_t llen;
14179
0
    int numdigits;      /* len == numnondigits + numdigits */
14180
0
    int numnondigits = 0;
14181
14182
    /* Avoid exceeding SSIZE_T_MAX */
14183
0
    if (prec > INT_MAX-3) {
14184
0
        PyErr_SetString(PyExc_OverflowError,
14185
0
                        "precision too large");
14186
0
        return NULL;
14187
0
    }
14188
14189
0
    assert(PyLong_Check(val));
14190
14191
0
    switch (type) {
14192
0
    default:
14193
0
        Py_UNREACHABLE();
14194
0
    case 'd':
14195
0
    case 'i':
14196
0
    case 'u':
14197
        /* int and int subclasses should print numerically when a numeric */
14198
        /* format code is used (see issue18780) */
14199
0
        result = PyNumber_ToBase(val, 10);
14200
0
        break;
14201
0
    case 'o':
14202
0
        numnondigits = 2;
14203
0
        result = PyNumber_ToBase(val, 8);
14204
0
        break;
14205
0
    case 'x':
14206
0
    case 'X':
14207
0
        numnondigits = 2;
14208
0
        result = PyNumber_ToBase(val, 16);
14209
0
        break;
14210
0
    }
14211
0
    if (!result)
14212
0
        return NULL;
14213
14214
0
    assert(unicode_modifiable(result));
14215
0
    assert(PyUnicode_IS_READY(result));
14216
0
    assert(PyUnicode_IS_ASCII(result));
14217
14218
    /* To modify the string in-place, there can only be one reference. */
14219
0
    if (Py_REFCNT(result) != 1) {
14220
0
        Py_DECREF(result);
14221
0
        PyErr_BadInternalCall();
14222
0
        return NULL;
14223
0
    }
14224
0
    buf = PyUnicode_DATA(result);
14225
0
    llen = PyUnicode_GET_LENGTH(result);
14226
0
    if (llen > INT_MAX) {
14227
0
        Py_DECREF(result);
14228
0
        PyErr_SetString(PyExc_ValueError,
14229
0
                        "string too large in _PyUnicode_FormatLong");
14230
0
        return NULL;
14231
0
    }
14232
0
    len = (int)llen;
14233
0
    sign = buf[0] == '-';
14234
0
    numnondigits += sign;
14235
0
    numdigits = len - numnondigits;
14236
0
    assert(numdigits > 0);
14237
14238
    /* Get rid of base marker unless F_ALT */
14239
0
    if (((alt) == 0 &&
14240
0
        (type == 'o' || type == 'x' || type == 'X'))) {
14241
0
        assert(buf[sign] == '0');
14242
0
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14243
0
               buf[sign+1] == 'o');
14244
0
        numnondigits -= 2;
14245
0
        buf += 2;
14246
0
        len -= 2;
14247
0
        if (sign)
14248
0
            buf[0] = '-';
14249
0
        assert(len == numnondigits + numdigits);
14250
0
        assert(numdigits > 0);
14251
0
    }
14252
14253
    /* Fill with leading zeroes to meet minimum width. */
14254
0
    if (prec > numdigits) {
14255
0
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14256
0
                                numnondigits + prec);
14257
0
        char *b1;
14258
0
        if (!r1) {
14259
0
            Py_DECREF(result);
14260
0
            return NULL;
14261
0
        }
14262
0
        b1 = PyBytes_AS_STRING(r1);
14263
0
        for (i = 0; i < numnondigits; ++i)
14264
0
            *b1++ = *buf++;
14265
0
        for (i = 0; i < prec - numdigits; i++)
14266
0
            *b1++ = '0';
14267
0
        for (i = 0; i < numdigits; i++)
14268
0
            *b1++ = *buf++;
14269
0
        *b1 = '\0';
14270
0
        Py_DECREF(result);
14271
0
        result = r1;
14272
0
        buf = PyBytes_AS_STRING(result);
14273
0
        len = numnondigits + prec;
14274
0
    }
14275
14276
    /* Fix up case for hex conversions. */
14277
0
    if (type == 'X') {
14278
        /* Need to convert all lower case letters to upper case.
14279
           and need to convert 0x to 0X (and -0x to -0X). */
14280
0
        for (i = 0; i < len; i++)
14281
0
            if (buf[i] >= 'a' && buf[i] <= 'x')
14282
0
                buf[i] -= 'a'-'A';
14283
0
    }
14284
0
    if (!PyUnicode_Check(result)
14285
0
        || buf != PyUnicode_DATA(result)) {
14286
0
        PyObject *unicode;
14287
0
        unicode = _PyUnicode_FromASCII(buf, len);
14288
0
        Py_DECREF(result);
14289
0
        result = unicode;
14290
0
    }
14291
0
    else if (len != PyUnicode_GET_LENGTH(result)) {
14292
0
        if (PyUnicode_Resize(&result, len) < 0)
14293
0
            Py_CLEAR(result);
14294
0
    }
14295
0
    return result;
14296
0
}
14297
14298
/* Format an integer or a float as an integer.
14299
 * Return 1 if the number has been formatted into the writer,
14300
 *        0 if the number has been formatted into *p_output
14301
 *       -1 and raise an exception on error */
14302
static int
14303
mainformatlong(PyObject *v,
14304
               struct unicode_format_arg_t *arg,
14305
               PyObject **p_output,
14306
               _PyUnicodeWriter *writer)
14307
28
{
14308
28
    PyObject *iobj, *res;
14309
28
    char type = (char)arg->ch;
14310
14311
28
    if (!PyNumber_Check(v))
14312
0
        goto wrongtype;
14313
14314
    /* make sure number is a type of integer for o, x, and X */
14315
28
    if (!PyLong_Check(v)) {
14316
0
        if (type == 'o' || type == 'x' || type == 'X') {
14317
0
            iobj = PyNumber_Index(v);
14318
0
            if (iobj == NULL) {
14319
0
                if (PyErr_ExceptionMatches(PyExc_TypeError))
14320
0
                    goto wrongtype;
14321
0
                return -1;
14322
0
            }
14323
0
        }
14324
0
        else {
14325
0
            iobj = PyNumber_Long(v);
14326
0
            if (iobj == NULL ) {
14327
0
                if (PyErr_ExceptionMatches(PyExc_TypeError))
14328
0
                    goto wrongtype;
14329
0
                return -1;
14330
0
            }
14331
0
        }
14332
0
        assert(PyLong_Check(iobj));
14333
0
    }
14334
28
    else {
14335
28
        iobj = v;
14336
28
        Py_INCREF(iobj);
14337
28
    }
14338
14339
28
    if (PyLong_CheckExact(v)
14340
28
        && arg->width == -1 && arg->prec == -1
14341
28
        && !(arg->flags & (F_SIGN | F_BLANK))
14342
28
        && type != 'X')
14343
28
    {
14344
        /* Fast path */
14345
28
        int alternate = arg->flags & F_ALT;
14346
28
        int base;
14347
14348
28
        switch(type)
14349
28
        {
14350
0
            default:
14351
0
                Py_UNREACHABLE();
14352
28
            case 'd':
14353
28
            case 'i':
14354
28
            case 'u':
14355
28
                base = 10;
14356
28
                break;
14357
0
            case 'o':
14358
0
                base = 8;
14359
0
                break;
14360
0
            case 'x':
14361
0
            case 'X':
14362
0
                base = 16;
14363
0
                break;
14364
28
        }
14365
14366
28
        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14367
0
            Py_DECREF(iobj);
14368
0
            return -1;
14369
0
        }
14370
28
        Py_DECREF(iobj);
14371
28
        return 1;
14372
28
    }
14373
14374
0
    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14375
0
    Py_DECREF(iobj);
14376
0
    if (res == NULL)
14377
0
        return -1;
14378
0
    *p_output = res;
14379
0
    return 0;
14380
14381
0
wrongtype:
14382
0
    switch(type)
14383
0
    {
14384
0
        case 'o':
14385
0
        case 'x':
14386
0
        case 'X':
14387
0
            PyErr_Format(PyExc_TypeError,
14388
0
                    "%%%c format: an integer is required, "
14389
0
                    "not %.200s",
14390
0
                    type, Py_TYPE(v)->tp_name);
14391
0
            break;
14392
0
        default:
14393
0
            PyErr_Format(PyExc_TypeError,
14394
0
                    "%%%c format: a number is required, "
14395
0
                    "not %.200s",
14396
0
                    type, Py_TYPE(v)->tp_name);
14397
0
            break;
14398
0
    }
14399
0
    return -1;
14400
0
}
14401
14402
static Py_UCS4
14403
formatchar(PyObject *v)
14404
0
{
14405
    /* presume that the buffer is at least 3 characters long */
14406
0
    if (PyUnicode_Check(v)) {
14407
0
        if (PyUnicode_GET_LENGTH(v) == 1) {
14408
0
            return PyUnicode_READ_CHAR(v, 0);
14409
0
        }
14410
0
        goto onError;
14411
0
    }
14412
0
    else {
14413
0
        PyObject *iobj;
14414
0
        long x;
14415
        /* make sure number is a type of integer */
14416
0
        if (!PyLong_Check(v)) {
14417
0
            iobj = PyNumber_Index(v);
14418
0
            if (iobj == NULL) {
14419
0
                goto onError;
14420
0
            }
14421
0
            x = PyLong_AsLong(iobj);
14422
0
            Py_DECREF(iobj);
14423
0
        }
14424
0
        else {
14425
0
            x = PyLong_AsLong(v);
14426
0
        }
14427
0
        if (x == -1 && PyErr_Occurred())
14428
0
            goto onError;
14429
14430
0
        if (x < 0 || x > MAX_UNICODE) {
14431
0
            PyErr_SetString(PyExc_OverflowError,
14432
0
                            "%c arg not in range(0x110000)");
14433
0
            return (Py_UCS4) -1;
14434
0
        }
14435
14436
0
        return (Py_UCS4) x;
14437
0
    }
14438
14439
0
  onError:
14440
0
    PyErr_SetString(PyExc_TypeError,
14441
0
                    "%c requires int or char");
14442
0
    return (Py_UCS4) -1;
14443
0
}
14444
14445
/* Parse options of an argument: flags, width, precision.
14446
   Handle also "%(name)" syntax.
14447
14448
   Return 0 if the argument has been formatted into arg->str.
14449
   Return 1 if the argument has been written into ctx->writer,
14450
   Raise an exception and return -1 on error. */
14451
static int
14452
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14453
                         struct unicode_format_arg_t *arg)
14454
116
{
14455
116
#define FORMAT_READ(ctx) \
14456
457
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14457
14458
116
    PyObject *v;
14459
14460
116
    if (arg->ch == '(') {
14461
        /* Get argument value from a dictionary. Example: "%(name)s". */
14462
46
        Py_ssize_t keystart;
14463
46
        Py_ssize_t keylen;
14464
46
        PyObject *key;
14465
46
        int pcount = 1;
14466
14467
46
        if (ctx->dict == NULL) {
14468
0
            PyErr_SetString(PyExc_TypeError,
14469
0
                            "format requires a mapping");
14470
0
            return -1;
14471
0
        }
14472
46
        ++ctx->fmtpos;
14473
46
        --ctx->fmtcnt;
14474
46
        keystart = ctx->fmtpos;
14475
        /* Skip over balanced parentheses */
14476
387
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14477
341
            arg->ch = FORMAT_READ(ctx);
14478
341
            if (arg->ch == ')')
14479
46
                --pcount;
14480
295
            else if (arg->ch == '(')
14481
0
                ++pcount;
14482
341
            ctx->fmtpos++;
14483
341
        }
14484
46
        keylen = ctx->fmtpos - keystart - 1;
14485
46
        if (ctx->fmtcnt < 0 || pcount > 0) {
14486
0
            PyErr_SetString(PyExc_ValueError,
14487
0
                            "incomplete format key");
14488
0
            return -1;
14489
0
        }
14490
46
        key = PyUnicode_Substring(ctx->fmtstr,
14491
46
                                  keystart, keystart + keylen);
14492
46
        if (key == NULL)
14493
0
            return -1;
14494
46
        if (ctx->args_owned) {
14495
31
            ctx->args_owned = 0;
14496
31
            Py_DECREF(ctx->args);
14497
31
        }
14498
46
        ctx->args = PyObject_GetItem(ctx->dict, key);
14499
46
        Py_DECREF(key);
14500
46
        if (ctx->args == NULL)
14501
0
            return -1;
14502
46
        ctx->args_owned = 1;
14503
46
        ctx->arglen = -1;
14504
46
        ctx->argidx = -2;
14505
46
    }
14506
14507
    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14508
116
    while (--ctx->fmtcnt >= 0) {
14509
116
        arg->ch = FORMAT_READ(ctx);
14510
116
        ctx->fmtpos++;
14511
116
        switch (arg->ch) {
14512
0
        case '-': arg->flags |= F_LJUST; continue;
14513
0
        case '+': arg->flags |= F_SIGN; continue;
14514
0
        case ' ': arg->flags |= F_BLANK; continue;
14515
0
        case '#': arg->flags |= F_ALT; continue;
14516
0
        case '0': arg->flags |= F_ZERO; continue;
14517
116
        }
14518
116
        break;
14519
116
    }
14520
14521
    /* Parse width. Example: "%10s" => width=10 */
14522
116
    if (arg->ch == '*') {
14523
0
        v = unicode_format_getnextarg(ctx);
14524
0
        if (v == NULL)
14525
0
            return -1;
14526
0
        if (!PyLong_Check(v)) {
14527
0
            PyErr_SetString(PyExc_TypeError,
14528
0
                            "* wants int");
14529
0
            return -1;
14530
0
        }
14531
0
        arg->width = PyLong_AsSsize_t(v);
14532
0
        if (arg->width == -1 && PyErr_Occurred())
14533
0
            return -1;
14534
0
        if (arg->width < 0) {
14535
0
            arg->flags |= F_LJUST;
14536
0
            arg->width = -arg->width;
14537
0
        }
14538
0
        if (--ctx->fmtcnt >= 0) {
14539
0
            arg->ch = FORMAT_READ(ctx);
14540
0
            ctx->fmtpos++;
14541
0
        }
14542
0
    }
14543
116
    else if (arg->ch >= '0' && arg->ch <= '9') {
14544
0
        arg->width = arg->ch - '0';
14545
0
        while (--ctx->fmtcnt >= 0) {
14546
0
            arg->ch = FORMAT_READ(ctx);
14547
0
            ctx->fmtpos++;
14548
0
            if (arg->ch < '0' || arg->ch > '9')
14549
0
                break;
14550
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14551
               mixing signed and unsigned comparison. Since arg->ch is between
14552
               '0' and '9', casting to int is safe. */
14553
0
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14554
0
                PyErr_SetString(PyExc_ValueError,
14555
0
                                "width too big");
14556
0
                return -1;
14557
0
            }
14558
0
            arg->width = arg->width*10 + (arg->ch - '0');
14559
0
        }
14560
0
    }
14561
14562
    /* Parse precision. Example: "%.3f" => prec=3 */
14563
116
    if (arg->ch == '.') {
14564
0
        arg->prec = 0;
14565
0
        if (--ctx->fmtcnt >= 0) {
14566
0
            arg->ch = FORMAT_READ(ctx);
14567
0
            ctx->fmtpos++;
14568
0
        }
14569
0
        if (arg->ch == '*') {
14570
0
            v = unicode_format_getnextarg(ctx);
14571
0
            if (v == NULL)
14572
0
                return -1;
14573
0
            if (!PyLong_Check(v)) {
14574
0
                PyErr_SetString(PyExc_TypeError,
14575
0
                                "* wants int");
14576
0
                return -1;
14577
0
            }
14578
0
            arg->prec = _PyLong_AsInt(v);
14579
0
            if (arg->prec == -1 && PyErr_Occurred())
14580
0
                return -1;
14581
0
            if (arg->prec < 0)
14582
0
                arg->prec = 0;
14583
0
            if (--ctx->fmtcnt >= 0) {
14584
0
                arg->ch = FORMAT_READ(ctx);
14585
0
                ctx->fmtpos++;
14586
0
            }
14587
0
        }
14588
0
        else if (arg->ch >= '0' && arg->ch <= '9') {
14589
0
            arg->prec = arg->ch - '0';
14590
0
            while (--ctx->fmtcnt >= 0) {
14591
0
                arg->ch = FORMAT_READ(ctx);
14592
0
                ctx->fmtpos++;
14593
0
                if (arg->ch < '0' || arg->ch > '9')
14594
0
                    break;
14595
0
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14596
0
                    PyErr_SetString(PyExc_ValueError,
14597
0
                                    "precision too big");
14598
0
                    return -1;
14599
0
                }
14600
0
                arg->prec = arg->prec*10 + (arg->ch - '0');
14601
0
            }
14602
0
        }
14603
0
    }
14604
14605
    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14606
116
    if (ctx->fmtcnt >= 0) {
14607
116
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14608
0
            if (--ctx->fmtcnt >= 0) {
14609
0
                arg->ch = FORMAT_READ(ctx);
14610
0
                ctx->fmtpos++;
14611
0
            }
14612
0
        }
14613
116
    }
14614
116
    if (ctx->fmtcnt < 0) {
14615
0
        PyErr_SetString(PyExc_ValueError,
14616
0
                        "incomplete format");
14617
0
        return -1;
14618
0
    }
14619
116
    return 0;
14620
14621
116
#undef FORMAT_READ
14622
116
}
14623
14624
/* Format one argument. Supported conversion specifiers:
14625
14626
   - "s", "r", "a": any type
14627
   - "i", "d", "u": int or float
14628
   - "o", "x", "X": int
14629
   - "e", "E", "f", "F", "g", "G": float
14630
   - "c": int or str (1 character)
14631
14632
   When possible, the output is written directly into the Unicode writer
14633
   (ctx->writer). A string is created when padding is required.
14634
14635
   Return 0 if the argument has been formatted into *p_str,
14636
          1 if the argument has been written into ctx->writer,
14637
         -1 on error. */
14638
static int
14639
unicode_format_arg_format(struct unicode_formatter_t *ctx,
14640
                          struct unicode_format_arg_t *arg,
14641
                          PyObject **p_str)
14642
116
{
14643
116
    PyObject *v;
14644
116
    _PyUnicodeWriter *writer = &ctx->writer;
14645
14646
116
    if (ctx->fmtcnt == 0)
14647
28
        ctx->writer.overallocate = 0;
14648
14649
116
    v = unicode_format_getnextarg(ctx);
14650
116
    if (v == NULL)
14651
0
        return -1;
14652
14653
14654
116
    switch (arg->ch) {
14655
88
    case 's':
14656
88
    case 'r':
14657
88
    case 'a':
14658
88
        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14659
            /* Fast path */
14660
0
            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14661
0
                return -1;
14662
0
            return 1;
14663
0
        }
14664
14665
88
        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14666
88
            *p_str = v;
14667
88
            Py_INCREF(*p_str);
14668
88
        }
14669
0
        else {
14670
0
            if (arg->ch == 's')
14671
0
                *p_str = PyObject_Str(v);
14672
0
            else if (arg->ch == 'r')
14673
0
                *p_str = PyObject_Repr(v);
14674
0
            else
14675
0
                *p_str = PyObject_ASCII(v);
14676
0
        }
14677
88
        break;
14678
14679
0
    case 'i':
14680
28
    case 'd':
14681
28
    case 'u':
14682
28
    case 'o':
14683
28
    case 'x':
14684
28
    case 'X':
14685
28
    {
14686
28
        int ret = mainformatlong(v, arg, p_str, writer);
14687
28
        if (ret != 0)
14688
28
            return ret;
14689
0
        arg->sign = 1;
14690
0
        break;
14691
28
    }
14692
14693
0
    case 'e':
14694
0
    case 'E':
14695
0
    case 'f':
14696
0
    case 'F':
14697
0
    case 'g':
14698
0
    case 'G':
14699
0
        if (arg->width == -1 && arg->prec == -1
14700
0
            && !(arg->flags & (F_SIGN | F_BLANK)))
14701
0
        {
14702
            /* Fast path */
14703
0
            if (formatfloat(v, arg, NULL, writer) == -1)
14704
0
                return -1;
14705
0
            return 1;
14706
0
        }
14707
14708
0
        arg->sign = 1;
14709
0
        if (formatfloat(v, arg, p_str, NULL) == -1)
14710
0
            return -1;
14711
0
        break;
14712
14713
0
    case 'c':
14714
0
    {
14715
0
        Py_UCS4 ch = formatchar(v);
14716
0
        if (ch == (Py_UCS4) -1)
14717
0
            return -1;
14718
0
        if (arg->width == -1 && arg->prec == -1) {
14719
            /* Fast path */
14720
0
            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14721
0
                return -1;
14722
0
            return 1;
14723
0
        }
14724
0
        *p_str = PyUnicode_FromOrdinal(ch);
14725
0
        break;
14726
0
    }
14727
14728
0
    default:
14729
0
        PyErr_Format(PyExc_ValueError,
14730
0
                     "unsupported format character '%c' (0x%x) "
14731
0
                     "at index %zd",
14732
0
                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14733
0
                     (int)arg->ch,
14734
0
                     ctx->fmtpos - 1);
14735
0
        return -1;
14736
116
    }
14737
88
    if (*p_str == NULL)
14738
0
        return -1;
14739
88
    assert (PyUnicode_Check(*p_str));
14740
88
    return 0;
14741
88
}
14742
14743
static int
14744
unicode_format_arg_output(struct unicode_formatter_t *ctx,
14745
                          struct unicode_format_arg_t *arg,
14746
                          PyObject *str)
14747
88
{
14748
88
    Py_ssize_t len;
14749
88
    enum PyUnicode_Kind kind;
14750
88
    void *pbuf;
14751
88
    Py_ssize_t pindex;
14752
88
    Py_UCS4 signchar;
14753
88
    Py_ssize_t buflen;
14754
88
    Py_UCS4 maxchar;
14755
88
    Py_ssize_t sublen;
14756
88
    _PyUnicodeWriter *writer = &ctx->writer;
14757
88
    Py_UCS4 fill;
14758
14759
88
    fill = ' ';
14760
88
    if (arg->sign && arg->flags & F_ZERO)
14761
0
        fill = '0';
14762
14763
88
    if (PyUnicode_READY(str) == -1)
14764
0
        return -1;
14765
14766
88
    len = PyUnicode_GET_LENGTH(str);
14767
88
    if ((arg->width == -1 || arg->width <= len)
14768
88
        && (arg->prec == -1 || arg->prec >= len)
14769
88
        && !(arg->flags & (F_SIGN | F_BLANK)))
14770
88
    {
14771
        /* Fast path */
14772
88
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14773
0
            return -1;
14774
88
        return 0;
14775
88
    }
14776
14777
    /* Truncate the string for "s", "r" and "a" formats
14778
       if the precision is set */
14779
0
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14780
0
        if (arg->prec >= 0 && len > arg->prec)
14781
0
            len = arg->prec;
14782
0
    }
14783
14784
    /* Adjust sign and width */
14785
0
    kind = PyUnicode_KIND(str);
14786
0
    pbuf = PyUnicode_DATA(str);
14787
0
    pindex = 0;
14788
0
    signchar = '\0';
14789
0
    if (arg->sign) {
14790
0
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14791
0
        if (ch == '-' || ch == '+') {
14792
0
            signchar = ch;
14793
0
            len--;
14794
0
            pindex++;
14795
0
        }
14796
0
        else if (arg->flags & F_SIGN)
14797
0
            signchar = '+';
14798
0
        else if (arg->flags & F_BLANK)
14799
0
            signchar = ' ';
14800
0
        else
14801
0
            arg->sign = 0;
14802
0
    }
14803
0
    if (arg->width < len)
14804
0
        arg->width = len;
14805
14806
    /* Prepare the writer */
14807
0
    maxchar = writer->maxchar;
14808
0
    if (!(arg->flags & F_LJUST)) {
14809
0
        if (arg->sign) {
14810
0
            if ((arg->width-1) > len)
14811
0
                maxchar = Py_MAX(maxchar, fill);
14812
0
        }
14813
0
        else {
14814
0
            if (arg->width > len)
14815
0
                maxchar = Py_MAX(maxchar, fill);
14816
0
        }
14817
0
    }
14818
0
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14819
0
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14820
0
        maxchar = Py_MAX(maxchar, strmaxchar);
14821
0
    }
14822
14823
0
    buflen = arg->width;
14824
0
    if (arg->sign && len == arg->width)
14825
0
        buflen++;
14826
0
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14827
0
        return -1;
14828
14829
    /* Write the sign if needed */
14830
0
    if (arg->sign) {
14831
0
        if (fill != ' ') {
14832
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14833
0
            writer->pos += 1;
14834
0
        }
14835
0
        if (arg->width > len)
14836
0
            arg->width--;
14837
0
    }
14838
14839
    /* Write the numeric prefix for "x", "X" and "o" formats
14840
       if the alternate form is used.
14841
       For example, write "0x" for the "%#x" format. */
14842
0
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14843
0
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14844
0
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14845
0
        if (fill != ' ') {
14846
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14847
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14848
0
            writer->pos += 2;
14849
0
            pindex += 2;
14850
0
        }
14851
0
        arg->width -= 2;
14852
0
        if (arg->width < 0)
14853
0
            arg->width = 0;
14854
0
        len -= 2;
14855
0
    }
14856
14857
    /* Pad left with the fill character if needed */
14858
0
    if (arg->width > len && !(arg->flags & F_LJUST)) {
14859
0
        sublen = arg->width - len;
14860
0
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
14861
0
        writer->pos += sublen;
14862
0
        arg->width = len;
14863
0
    }
14864
14865
    /* If padding with spaces: write sign if needed and/or numeric prefix if
14866
       the alternate form is used */
14867
0
    if (fill == ' ') {
14868
0
        if (arg->sign) {
14869
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14870
0
            writer->pos += 1;
14871
0
        }
14872
0
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14873
0
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14874
0
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14875
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14876
0
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14877
0
            writer->pos += 2;
14878
0
            pindex += 2;
14879
0
        }
14880
0
    }
14881
14882
    /* Write characters */
14883
0
    if (len) {
14884
0
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14885
0
                                      str, pindex, len);
14886
0
        writer->pos += len;
14887
0
    }
14888
14889
    /* Pad right with the fill character if needed */
14890
0
    if (arg->width > len) {
14891
0
        sublen = arg->width - len;
14892
0
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
14893
0
        writer->pos += sublen;
14894
0
    }
14895
0
    return 0;
14896
0
}
14897
14898
/* Helper of PyUnicode_Format(): format one arg.
14899
   Return 0 on success, raise an exception and return -1 on error. */
14900
static int
14901
unicode_format_arg(struct unicode_formatter_t *ctx)
14902
116
{
14903
116
    struct unicode_format_arg_t arg;
14904
116
    PyObject *str;
14905
116
    int ret;
14906
14907
116
    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14908
116
    if (arg.ch == '%') {
14909
0
        ctx->fmtpos++;
14910
0
        ctx->fmtcnt--;
14911
0
        if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14912
0
            return -1;
14913
0
        return 0;
14914
0
    }
14915
116
    arg.flags = 0;
14916
116
    arg.width = -1;
14917
116
    arg.prec = -1;
14918
116
    arg.sign = 0;
14919
116
    str = NULL;
14920
14921
116
    ret = unicode_format_arg_parse(ctx, &arg);
14922
116
    if (ret == -1)
14923
0
        return -1;
14924
14925
116
    ret = unicode_format_arg_format(ctx, &arg, &str);
14926
116
    if (ret == -1)
14927
0
        return -1;
14928
14929
116
    if (ret != 1) {
14930
88
        ret = unicode_format_arg_output(ctx, &arg, str);
14931
88
        Py_DECREF(str);
14932
88
        if (ret == -1)
14933
0
            return -1;
14934
88
    }
14935
14936
116
    if (ctx->dict && (ctx->argidx < ctx->arglen)) {
14937
0
        PyErr_SetString(PyExc_TypeError,
14938
0
                        "not all arguments converted during string formatting");
14939
0
        return -1;
14940
0
    }
14941
116
    return 0;
14942
116
}
14943
14944
/* Implementation of the "format % args" operation for str.

   `format` must be a str. `args` is a tuple of positional arguments,
   a mapping, or a single value.

   Return a new string, or NULL with an exception set on error. */
PyObject *
PyUnicode_Format(PyObject *format, PyObject *args)
{
    struct unicode_formatter_t ctx;
    int args_is_tuple;

    if (format == NULL || args == NULL) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (ensure_unicode(format) < 0) {
        return NULL;
    }

    /* Format string cursor */
    ctx.fmtstr = format;
    ctx.fmtdata = PyUnicode_DATA(format);
    ctx.fmtkind = PyUnicode_KIND(format);
    ctx.fmtcnt = PyUnicode_GET_LENGTH(format);
    ctx.fmtpos = 0;

    /* Output writer */
    _PyUnicodeWriter_Init(&ctx.writer);
    ctx.writer.min_length = ctx.fmtcnt + 100;
    ctx.writer.overallocate = 1;

    /* Arguments */
    args_is_tuple = PyTuple_Check(args);
    ctx.args = args;
    ctx.args_owned = 0;
    if (args_is_tuple) {
        ctx.arglen = PyTuple_Size(args);
        ctx.argidx = 0;
    }
    else {
        ctx.arglen = -1;
        ctx.argidx = -2;
    }
    /* Tuples and strings are never used as a mapping, even though they
       pass PyMapping_Check() */
    if (!args_is_tuple && !PyUnicode_Check(args) && PyMapping_Check(args)) {
        ctx.dict = args;
    }
    else {
        ctx.dict = NULL;
    }

    while (--ctx.fmtcnt >= 0) {
        Py_ssize_t literal_start;

        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) == '%') {
            /* Conversion specifier: skip the '%' and format one argument */
            ctx.fmtpos++;
            if (unicode_format_arg(&ctx) == -1) {
                goto onError;
            }
            continue;
        }

        /* Literal text: collect the whole run up to the next '%' */
        literal_start = ctx.fmtpos++;
        while (ctx.fmtcnt >= 0 &&
               PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
            ctx.fmtpos++;
            ctx.fmtcnt--;
        }
        if (ctx.fmtcnt < 0) {
            /* The scan ran one position past the end of the format;
               undo that. Nothing follows, so stop overallocating. */
            ctx.fmtpos--;
            ctx.writer.overallocate = 0;
        }

        if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
                                            literal_start, ctx.fmtpos) < 0) {
            goto onError;
        }
    }

    if (!ctx.dict && ctx.argidx < ctx.arglen) {
        PyErr_SetString(PyExc_TypeError,
                        "not all arguments converted during string formatting");
        goto onError;
    }

    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return _PyUnicodeWriter_Finish(&ctx.writer);

  onError:
    _PyUnicodeWriter_Dealloc(&ctx.writer);
    if (ctx.args_owned) {
        Py_DECREF(ctx.args);
    }
    return NULL;
}
15026
15027
static PyObject *
15028
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15029
15030
static PyObject *
15031
unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15032
760
{
15033
760
    PyObject *x = NULL;
15034
760
    static char *kwlist[] = {"object", "encoding", "errors", 0};
15035
760
    char *encoding = NULL;
15036
760
    char *errors = NULL;
15037
15038
760
    if (type != &PyUnicode_Type)
15039
0
        return unicode_subtype_new(type, args, kwds);
15040
760
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
15041
760
                                     kwlist, &x, &encoding, &errors))
15042
0
        return NULL;
15043
760
    if (x == NULL)
15044
0
        _Py_RETURN_UNICODE_EMPTY();
15045
760
    if (encoding == NULL && errors == NULL)
15046
759
        return PyObject_Str(x);
15047
1
    else
15048
1
        return PyUnicode_FromEncodedObject(x, encoding, errors);
15049
760
}
15050
15051
static PyObject *
15052
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15053
0
{
15054
0
    PyObject *unicode, *self;
15055
0
    Py_ssize_t length, char_size;
15056
0
    int share_wstr, share_utf8;
15057
0
    unsigned int kind;
15058
0
    void *data;
15059
15060
0
    assert(PyType_IsSubtype(type, &PyUnicode_Type));
15061
15062
0
    unicode = unicode_new(&PyUnicode_Type, args, kwds);
15063
0
    if (unicode == NULL)
15064
0
        return NULL;
15065
0
    assert(_PyUnicode_CHECK(unicode));
15066
0
    if (PyUnicode_READY(unicode) == -1) {
15067
0
        Py_DECREF(unicode);
15068
0
        return NULL;
15069
0
    }
15070
15071
0
    self = type->tp_alloc(type, 0);
15072
0
    if (self == NULL) {
15073
0
        Py_DECREF(unicode);
15074
0
        return NULL;
15075
0
    }
15076
0
    kind = PyUnicode_KIND(unicode);
15077
0
    length = PyUnicode_GET_LENGTH(unicode);
15078
15079
0
    _PyUnicode_LENGTH(self) = length;
15080
#ifdef Py_DEBUG
15081
    _PyUnicode_HASH(self) = -1;
15082
#else
15083
0
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15084
0
#endif
15085
0
    _PyUnicode_STATE(self).interned = 0;
15086
0
    _PyUnicode_STATE(self).kind = kind;
15087
0
    _PyUnicode_STATE(self).compact = 0;
15088
0
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
15089
0
    _PyUnicode_STATE(self).ready = 1;
15090
0
    _PyUnicode_WSTR(self) = NULL;
15091
0
    _PyUnicode_UTF8_LENGTH(self) = 0;
15092
0
    _PyUnicode_UTF8(self) = NULL;
15093
0
    _PyUnicode_WSTR_LENGTH(self) = 0;
15094
0
    _PyUnicode_DATA_ANY(self) = NULL;
15095
15096
0
    share_utf8 = 0;
15097
0
    share_wstr = 0;
15098
0
    if (kind == PyUnicode_1BYTE_KIND) {
15099
0
        char_size = 1;
15100
0
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15101
0
            share_utf8 = 1;
15102
0
    }
15103
0
    else if (kind == PyUnicode_2BYTE_KIND) {
15104
0
        char_size = 2;
15105
0
        if (sizeof(wchar_t) == 2)
15106
0
            share_wstr = 1;
15107
0
    }
15108
0
    else {
15109
0
        assert(kind == PyUnicode_4BYTE_KIND);
15110
0
        char_size = 4;
15111
0
        if (sizeof(wchar_t) == 4)
15112
0
            share_wstr = 1;
15113
0
    }
15114
15115
    /* Ensure we won't overflow the length. */
15116
0
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15117
0
        PyErr_NoMemory();
15118
0
        goto onError;
15119
0
    }
15120
0
    data = PyObject_MALLOC((length + 1) * char_size);
15121
0
    if (data == NULL) {
15122
0
        PyErr_NoMemory();
15123
0
        goto onError;
15124
0
    }
15125
15126
0
    _PyUnicode_DATA_ANY(self) = data;
15127
0
    if (share_utf8) {
15128
0
        _PyUnicode_UTF8_LENGTH(self) = length;
15129
0
        _PyUnicode_UTF8(self) = data;
15130
0
    }
15131
0
    if (share_wstr) {
15132
0
        _PyUnicode_WSTR_LENGTH(self) = length;
15133
0
        _PyUnicode_WSTR(self) = (wchar_t *)data;
15134
0
    }
15135
15136
0
    memcpy(data, PyUnicode_DATA(unicode),
15137
0
              kind * (length + 1));
15138
0
    assert(_PyUnicode_CheckConsistency(self, 1));
15139
#ifdef Py_DEBUG
15140
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15141
#endif
15142
0
    Py_DECREF(unicode);
15143
0
    return self;
15144
15145
0
onError:
15146
0
    Py_DECREF(unicode);
15147
0
    Py_DECREF(self);
15148
0
    return NULL;
15149
0
}
15150
15151
PyDoc_STRVAR(unicode_doc,
15152
"str(object='') -> str\n\
15153
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
15154
\n\
15155
Create a new string object from the given object. If encoding or\n\
15156
errors is specified, then the object must expose a data buffer\n\
15157
that will be decoded using the given encoding and error handler.\n\
15158
Otherwise, returns the result of object.__str__() (if defined)\n\
15159
or repr(object).\n\
15160
encoding defaults to sys.getdefaultencoding().\n\
15161
errors defaults to 'strict'.");
15162
15163
static PyObject *unicode_iter(PyObject *seq);
15164
15165
PyTypeObject PyUnicode_Type = {
15166
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15167
    "str",                        /* tp_name */
15168
    sizeof(PyUnicodeObject),      /* tp_basicsize */
15169
    0,                            /* tp_itemsize */
15170
    /* Slots */
15171
    (destructor)unicode_dealloc,  /* tp_dealloc */
15172
    0,                            /* tp_vectorcall_offset */
15173
    0,                            /* tp_getattr */
15174
    0,                            /* tp_setattr */
15175
    0,                            /* tp_as_async */
15176
    unicode_repr,                 /* tp_repr */
15177
    &unicode_as_number,           /* tp_as_number */
15178
    &unicode_as_sequence,         /* tp_as_sequence */
15179
    &unicode_as_mapping,          /* tp_as_mapping */
15180
    (hashfunc) unicode_hash,      /* tp_hash*/
15181
    0,                            /* tp_call*/
15182
    (reprfunc) unicode_str,       /* tp_str */
15183
    PyObject_GenericGetAttr,      /* tp_getattro */
15184
    0,                            /* tp_setattro */
15185
    0,                            /* tp_as_buffer */
15186
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
15187
    Py_TPFLAGS_UNICODE_SUBCLASS,   /* tp_flags */
15188
    unicode_doc,                  /* tp_doc */
15189
    0,                            /* tp_traverse */
15190
    0,                            /* tp_clear */
15191
    PyUnicode_RichCompare,        /* tp_richcompare */
15192
    0,                            /* tp_weaklistoffset */
15193
    unicode_iter,                 /* tp_iter */
15194
    0,                            /* tp_iternext */
15195
    unicode_methods,              /* tp_methods */
15196
    0,                            /* tp_members */
15197
    0,                            /* tp_getset */
15198
    &PyBaseObject_Type,           /* tp_base */
15199
    0,                            /* tp_dict */
15200
    0,                            /* tp_descr_get */
15201
    0,                            /* tp_descr_set */
15202
    0,                            /* tp_dictoffset */
15203
    0,                            /* tp_init */
15204
    0,                            /* tp_alloc */
15205
    unicode_new,                  /* tp_new */
15206
    PyObject_Del,                 /* tp_free */
15207
};
15208
15209
/* Initialize the Unicode implementation */
15210
15211
PyStatus
15212
_PyUnicode_Init(void)
15213
14
{
15214
    /* XXX - move this array to unicodectype.c ? */
15215
14
    Py_UCS2 linebreak[] = {
15216
14
        0x000A, /* LINE FEED */
15217
14
        0x000D, /* CARRIAGE RETURN */
15218
14
        0x001C, /* FILE SEPARATOR */
15219
14
        0x001D, /* GROUP SEPARATOR */
15220
14
        0x001E, /* RECORD SEPARATOR */
15221
14
        0x0085, /* NEXT LINE */
15222
14
        0x2028, /* LINE SEPARATOR */
15223
14
        0x2029, /* PARAGRAPH SEPARATOR */
15224
14
    };
15225
15226
    /* Init the implementation */
15227
14
    _Py_INCREF_UNICODE_EMPTY();
15228
14
    if (!unicode_empty) {
15229
0
        return _PyStatus_ERR("Can't create empty string");
15230
0
    }
15231
14
    Py_DECREF(unicode_empty);
15232
15233
14
    if (PyType_Ready(&PyUnicode_Type) < 0) {
15234
0
        return _PyStatus_ERR("Can't initialize unicode type");
15235
0
    }
15236
15237
    /* initialize the linebreak bloom filter */
15238
14
    bloom_linebreak = make_bloom_mask(
15239
14
        PyUnicode_2BYTE_KIND, linebreak,
15240
14
        Py_ARRAY_LENGTH(linebreak));
15241
15242
14
    if (PyType_Ready(&EncodingMapType) < 0) {
15243
0
         return _PyStatus_ERR("Can't initialize encoding map type");
15244
0
    }
15245
14
    if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15246
0
        return _PyStatus_ERR("Can't initialize field name iterator type");
15247
0
    }
15248
14
    if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15249
0
        return _PyStatus_ERR("Can't initialize formatter iter type");
15250
0
    }
15251
14
    return _PyStatus_OK();
15252
14
}
15253
15254
/* Finalize the Unicode implementation */

/* This implementation has nothing to clear here: the function does no
   work and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
15261
15262
15263
void
15264
PyUnicode_InternInPlace(PyObject **p)
15265
232k
{
15266
232k
    PyObject *s = *p;
15267
232k
    PyObject *t;
15268
#ifdef Py_DEBUG
15269
    assert(s != NULL);
15270
    assert(_PyUnicode_CHECK(s));
15271
#else
15272
232k
    if (s == NULL || !PyUnicode_Check(s))
15273
0
        return;
15274
232k
#endif
15275
    /* If it's a subclass, we don't really know what putting
15276
       it in the interned dict might do. */
15277
232k
    if (!PyUnicode_CheckExact(s))
15278
0
        return;
15279
232k
    if (PyUnicode_CHECK_INTERNED(s))
15280
139k
        return;
15281
92.8k
    if (interned == NULL) {
15282
14
        interned = PyDict_New();
15283
14
        if (interned == NULL) {
15284
0
            PyErr_Clear(); /* Don't leave an exception */
15285
0
            return;
15286
0
        }
15287
14
    }
15288
92.8k
    Py_ALLOW_RECURSION
15289
92.8k
    t = PyDict_SetDefault(interned, s, s);
15290
92.8k
    Py_END_ALLOW_RECURSION
15291
92.8k
    if (t == NULL) {
15292
0
        PyErr_Clear();
15293
0
        return;
15294
0
    }
15295
92.8k
    if (t != s) {
15296
49.8k
        Py_INCREF(t);
15297
49.8k
        Py_SETREF(*p, t);
15298
49.8k
        return;
15299
49.8k
    }
15300
    /* The two references in interned are not counted by refcnt.
15301
       The deallocator will take care of this */
15302
43.0k
    Py_REFCNT(s) -= 2;
15303
43.0k
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15304
43.0k
}
15305
15306
void
15307
PyUnicode_InternImmortal(PyObject **p)
15308
0
{
15309
0
    PyUnicode_InternInPlace(p);
15310
0
    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15311
0
        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15312
0
        Py_INCREF(*p);
15313
0
    }
15314
0
}
15315
15316
PyObject *
15317
PyUnicode_InternFromString(const char *cp)
15318
36.0k
{
15319
36.0k
    PyObject *s = PyUnicode_FromString(cp);
15320
36.0k
    if (s == NULL)
15321
0
        return NULL;
15322
36.0k
    PyUnicode_InternInPlace(&s);
15323
36.0k
    return s;
15324
36.0k
}
15325
15326
15327
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Give every interned string its uncounted references back, mark it as
   not interned, then clear and release the `interned` dict.
   Only compiled for Valgrind and Insure++ builds. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    Py_ssize_t count;
    Py_ssize_t idx;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* One reference is not counted for an immortal string */
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Two references are not counted for a mortal string */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
15386
15387
15388
/********************* Unicode Iterator **************************/
15389
15390
typedef struct {
15391
    PyObject_HEAD
15392
    Py_ssize_t it_index;
15393
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15394
} unicodeiterobject;
15395
15396
static void
15397
unicodeiter_dealloc(unicodeiterobject *it)
15398
66
{
15399
66
    _PyObject_GC_UNTRACK(it);
15400
66
    Py_XDECREF(it->it_seq);
15401
66
    PyObject_GC_Del(it);
15402
66
}
15403
15404
static int
15405
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15406
0
{
15407
0
    Py_VISIT(it->it_seq);
15408
0
    return 0;
15409
0
}
15410
15411
static PyObject *
15412
unicodeiter_next(unicodeiterobject *it)
15413
445
{
15414
445
    PyObject *seq, *item;
15415
15416
445
    assert(it != NULL);
15417
445
    seq = it->it_seq;
15418
445
    if (seq == NULL)
15419
0
        return NULL;
15420
445
    assert(_PyUnicode_CHECK(seq));
15421
15422
445
    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15423
393
        int kind = PyUnicode_KIND(seq);
15424
393
        void *data = PyUnicode_DATA(seq);
15425
393
        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15426
393
        item = PyUnicode_FromOrdinal(chr);
15427
393
        if (item != NULL)
15428
393
            ++it->it_index;
15429
393
        return item;
15430
393
    }
15431
15432
52
    it->it_seq = NULL;
15433
52
    Py_DECREF(seq);
15434
52
    return NULL;
15435
445
}
15436
15437
static PyObject *
15438
unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15439
0
{
15440
0
    Py_ssize_t len = 0;
15441
0
    if (it->it_seq)
15442
0
        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15443
0
    return PyLong_FromSsize_t(len);
15444
0
}
15445
15446
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15447
15448
static PyObject *
15449
unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
15450
0
{
15451
0
    _Py_IDENTIFIER(iter);
15452
0
    if (it->it_seq != NULL) {
15453
0
        return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
15454
0
                             it->it_seq, it->it_index);
15455
0
    } else {
15456
0
        PyObject *u = (PyObject *)_PyUnicode_New(0);
15457
0
        if (u == NULL)
15458
0
            return NULL;
15459
0
        return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
15460
0
    }
15461
0
}
15462
15463
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15464
15465
static PyObject *
15466
unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15467
0
{
15468
0
    Py_ssize_t index = PyLong_AsSsize_t(state);
15469
0
    if (index == -1 && PyErr_Occurred())
15470
0
        return NULL;
15471
0
    if (it->it_seq != NULL) {
15472
0
        if (index < 0)
15473
0
            index = 0;
15474
0
        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15475
0
            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15476
0
        it->it_index = index;
15477
0
    }
15478
0
    Py_RETURN_NONE;
15479
0
}
15480
15481
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15482
15483
static PyMethodDef unicodeiter_methods[] = {
15484
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15485
     length_hint_doc},
15486
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15487
     reduce_doc},
15488
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15489
     setstate_doc},
15490
    {NULL,      NULL}       /* sentinel */
15491
};
15492
15493
PyTypeObject PyUnicodeIter_Type = {
15494
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15495
    "str_iterator",         /* tp_name */
15496
    sizeof(unicodeiterobject),      /* tp_basicsize */
15497
    0,                  /* tp_itemsize */
15498
    /* methods */
15499
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15500
    0,                  /* tp_vectorcall_offset */
15501
    0,                  /* tp_getattr */
15502
    0,                  /* tp_setattr */
15503
    0,                  /* tp_as_async */
15504
    0,                  /* tp_repr */
15505
    0,                  /* tp_as_number */
15506
    0,                  /* tp_as_sequence */
15507
    0,                  /* tp_as_mapping */
15508
    0,                  /* tp_hash */
15509
    0,                  /* tp_call */
15510
    0,                  /* tp_str */
15511
    PyObject_GenericGetAttr,        /* tp_getattro */
15512
    0,                  /* tp_setattro */
15513
    0,                  /* tp_as_buffer */
15514
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15515
    0,                  /* tp_doc */
15516
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
15517
    0,                  /* tp_clear */
15518
    0,                  /* tp_richcompare */
15519
    0,                  /* tp_weaklistoffset */
15520
    PyObject_SelfIter,          /* tp_iter */
15521
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
15522
    unicodeiter_methods,            /* tp_methods */
15523
    0,
15524
};
15525
15526
/* Build a new str_iterator over `seq`.

   `seq` must be a str object; anything else raises via
   PyErr_BadInternalCall().  The string is made "ready" before the iterator
   is allocated.  The iterator keeps its own reference to `seq` and starts
   at index 0.

   Returns the new iterator, or NULL on failure. */
static PyObject *
unicode_iter(PyObject *seq)
{
    if (!PyUnicode_Check(seq)) {
        PyErr_BadInternalCall();
        return NULL;
    }
    if (PyUnicode_READY(seq) == -1) {
        return NULL;
    }

    unicodeiterobject *iter = PyObject_GC_New(unicodeiterobject,
                                              &PyUnicodeIter_Type);
    if (iter == NULL) {
        return NULL;
    }

    /* Fill in every field before the object becomes visible to the GC. */
    iter->it_index = 0;
    Py_INCREF(seq);
    iter->it_seq = seq;
    _PyObject_GC_TRACK(iter);

    return (PyObject *)iter;
}
15546
15547
15548
size_t
15549
Py_UNICODE_strlen(const Py_UNICODE *u)
15550
0
{
15551
0
    return wcslen(u);
15552
0
}
15553
15554
/* Copy the NUL-terminated string `s2`, terminator included, into `s1`.
   No bounds checking is done: `s1` must be large enough.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (!ch) {
            /* The terminator has just been copied. */
            break;
        }
    }
    return s1;
}
15561
15562
/* Copy at most `n` code units from the NUL-terminated string `s2` into
   `s1`, stopping right after the terminating NUL has been copied.

   The previous loop checked the counter only after storing a unit, so it
   wrote one unit when n == 0 and up to n + 1 units otherwise, overrunning
   a destination sized for exactly `n` elements.  This version never
   stores more than `n` units.

   As with C strncpy(), `s1` is left without a terminator when `s2` holds
   `n` or more non-NUL units.  Unlike strncpy(), the remainder of `s1` is
   not zero-padded when `s2` is shorter than `n`.

   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    for (; n > 0; n--) {
        if ((*u++ = *s2++) == 0) {
            /* Terminator copied: nothing left to transfer. */
            break;
        }
    }
    return s1;
}
15571
15572
/* Append the NUL-terminated string `s2`, terminator included, to the end
   of the NUL-terminated string `s1`.  No bounds checking is done.
   Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the current terminator of s1. */
    Py_UNICODE *dst = s1 + wcslen(s1);

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (!ch) {
            break;
        }
    }
    return s1;
}
15580
15581
int
15582
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15583
0
{
15584
0
    while (*s1 && *s2 && *s1 == *s2)
15585
0
        s1++, s2++;
15586
0
    if (*s1 && *s2)
15587
0
        return (*s1 < *s2) ? -1 : +1;
15588
0
    if (*s1)
15589
0
        return 1;
15590
0
    if (*s2)
15591
0
        return -1;
15592
0
    return 0;
15593
0
}
15594
15595
int
15596
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15597
0
{
15598
0
    Py_UNICODE u1, u2;
15599
0
    for (; n != 0; n--) {
15600
0
        u1 = *s1;
15601
0
        u2 = *s2;
15602
0
        if (u1 != u2)
15603
0
            return (u1 < u2) ? -1 : +1;
15604
0
        if (u1 == '\0')
15605
0
            return 0;
15606
0
        s1++;
15607
0
        s2++;
15608
0
    }
15609
0
    return 0;
15610
0
}
15611
15612
/* Find the first occurrence of `c` in the NUL-terminated string `s`.

   Returns a pointer to the matching unit, or NULL if there is none.  The
   terminator itself is never matched: the scan stops before examining
   it, so searching for 0 returns NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cursor = s;

    while (*cursor) {
        if (*cursor == c) {
            return (Py_UNICODE*)cursor;
        }
        cursor++;
    }
    return NULL;
}
15621
15622
/* Find the last occurrence of `c` in the NUL-terminated string `s`.

   Returns a pointer to the matching unit, or NULL if there is none.  The
   scan starts just before the terminator, so the terminator itself is
   never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t remaining = wcslen(s);

    while (remaining > 0) {
        remaining--;
        if (s[remaining] == c) {
            return (Py_UNICODE*)(s + remaining);
        }
    }
    return NULL;
}
15634
15635
Py_UNICODE*
15636
PyUnicode_AsUnicodeCopy(PyObject *unicode)
15637
0
{
15638
0
    Py_UNICODE *u, *copy;
15639
0
    Py_ssize_t len, size;
15640
15641
0
    if (!PyUnicode_Check(unicode)) {
15642
0
        PyErr_BadArgument();
15643
0
        return NULL;
15644
0
    }
15645
0
    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15646
0
    if (u == NULL)
15647
0
        return NULL;
15648
    /* Ensure we won't overflow the size. */
15649
0
    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15650
0
        PyErr_NoMemory();
15651
0
        return NULL;
15652
0
    }
15653
0
    size = len + 1; /* copy the null character */
15654
0
    size *= sizeof(Py_UNICODE);
15655
0
    copy = PyMem_Malloc(size);
15656
0
    if (copy == NULL) {
15657
0
        PyErr_NoMemory();
15658
0
        return NULL;
15659
0
    }
15660
0
    memcpy(copy, u, size);
15661
0
    return copy;
15662
0
}
15663
15664
15665
static int
15666
encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
15667
56
{
15668
56
    int res;
15669
56
    res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15670
56
    if (res == -2) {
15671
0
        PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15672
0
        return -1;
15673
0
    }
15674
56
    if (res < 0) {
15675
0
        PyErr_NoMemory();
15676
0
        return -1;
15677
0
    }
15678
56
    return 0;
15679
56
}
15680
15681
15682
static int
15683
config_get_codec_name(wchar_t **config_encoding)
15684
28
{
15685
28
    char *encoding;
15686
28
    if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15687
0
        return -1;
15688
0
    }
15689
15690
28
    PyObject *name_obj = NULL;
15691
28
    PyObject *codec = _PyCodec_Lookup(encoding);
15692
28
    PyMem_RawFree(encoding);
15693
15694
28
    if (!codec)
15695
0
        goto error;
15696
15697
28
    name_obj = PyObject_GetAttrString(codec, "name");
15698
28
    Py_CLEAR(codec);
15699
28
    if (!name_obj) {
15700
0
        goto error;
15701
0
    }
15702
15703
28
    wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15704
28
    Py_DECREF(name_obj);
15705
28
    if (wname == NULL) {
15706
0
        goto error;
15707
0
    }
15708
15709
28
    wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15710
28
    if (raw_wname == NULL) {
15711
0
        PyMem_Free(wname);
15712
0
        PyErr_NoMemory();
15713
0
        goto error;
15714
0
    }
15715
15716
28
    PyMem_RawFree(*config_encoding);
15717
28
    *config_encoding = raw_wname;
15718
15719
28
    PyMem_Free(wname);
15720
28
    return 0;
15721
15722
0
error:
15723
0
    Py_XDECREF(codec);
15724
0
    Py_XDECREF(name_obj);
15725
0
    return -1;
15726
28
}
15727
15728
15729
static PyStatus
15730
init_stdio_encoding(PyThreadState *tstate)
15731
14
{
15732
    /* Update the stdio encoding to the normalized Python codec name. */
15733
14
    PyConfig *config = &tstate->interp->config;
15734
14
    if (config_get_codec_name(&config->stdio_encoding) < 0) {
15735
0
        return _PyStatus_ERR("failed to get the Python codec name "
15736
0
                             "of the stdio encoding");
15737
0
    }
15738
14
    return _PyStatus_OK();
15739
14
}
15740
15741
15742
static int
15743
init_fs_codec(PyInterpreterState *interp)
15744
14
{
15745
14
    PyConfig *config = &interp->config;
15746
15747
14
    _Py_error_handler error_handler;
15748
14
    error_handler = get_error_handler_wide(config->filesystem_errors);
15749
14
    if (error_handler == _Py_ERROR_UNKNOWN) {
15750
0
        PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15751
0
        return -1;
15752
0
    }
15753
15754
14
    char *encoding, *errors;
15755
14
    if (encode_wstr_utf8(config->filesystem_encoding,
15756
14
                         &encoding,
15757
14
                         "filesystem_encoding") < 0) {
15758
0
        return -1;
15759
0
    }
15760
15761
14
    if (encode_wstr_utf8(config->filesystem_errors,
15762
14
                         &errors,
15763
14
                         "filesystem_errors") < 0) {
15764
0
        PyMem_RawFree(encoding);
15765
0
        return -1;
15766
0
    }
15767
15768
14
    PyMem_RawFree(interp->fs_codec.encoding);
15769
14
    interp->fs_codec.encoding = encoding;
15770
14
    PyMem_RawFree(interp->fs_codec.errors);
15771
14
    interp->fs_codec.errors = errors;
15772
14
    interp->fs_codec.error_handler = error_handler;
15773
15774
    /* At this point, PyUnicode_EncodeFSDefault() and
15775
       PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15776
       the C implementation of the filesystem encoding. */
15777
15778
    /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15779
       global configuration variables. */
15780
14
    if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15781
14
                                  interp->fs_codec.errors) < 0) {
15782
0
        PyErr_NoMemory();
15783
0
        return -1;
15784
0
    }
15785
14
    return 0;
15786
14
}
15787
15788
15789
static PyStatus
15790
init_fs_encoding(PyThreadState *tstate)
15791
14
{
15792
14
    PyInterpreterState *interp = tstate->interp;
15793
15794
    /* Update the filesystem encoding to the normalized Python codec name.
15795
       For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15796
       (Python codec name). */
15797
14
    PyConfig *config = &interp->config;
15798
14
    if (config_get_codec_name(&config->filesystem_encoding) < 0) {
15799
0
        _Py_DumpPathConfig(tstate);
15800
0
        return _PyStatus_ERR("failed to get the Python codec "
15801
0
                             "of the filesystem encoding");
15802
0
    }
15803
15804
14
    if (init_fs_codec(interp) < 0) {
15805
0
        return _PyStatus_ERR("cannot initialize filesystem codec");
15806
0
    }
15807
14
    return _PyStatus_OK();
15808
14
}
15809
15810
15811
PyStatus
15812
_PyUnicode_InitEncodings(PyThreadState *tstate)
15813
14
{
15814
14
    PyStatus status = init_fs_encoding(tstate);
15815
14
    if (_PyStatus_EXCEPTION(status)) {
15816
0
        return status;
15817
0
    }
15818
15819
14
    return init_stdio_encoding(tstate);
15820
14
}
15821
15822
15823
#ifdef MS_WINDOWS
15824
int
15825
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
15826
{
15827
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15828
    PyConfig *config = &interp->config;
15829
15830
    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15831
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15832
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15833
    if (encoding == NULL || errors == NULL) {
15834
        PyMem_RawFree(encoding);
15835
        PyMem_RawFree(errors);
15836
        PyErr_NoMemory();
15837
        return -1;
15838
    }
15839
15840
    PyMem_RawFree(config->filesystem_encoding);
15841
    config->filesystem_encoding = encoding;
15842
    PyMem_RawFree(config->filesystem_errors);
15843
    config->filesystem_errors = errors;
15844
15845
    return init_fs_codec(interp);
15846
}
15847
#endif
15848
15849
15850
void
15851
_PyUnicode_Fini(void)
15852
0
{
15853
#if defined(WITH_VALGRIND) || defined(__INSURE__)
15854
    /* Insure++ is a memory analysis tool that aids in discovering
15855
     * memory leaks and other memory problems.  On Python exit, the
15856
     * interned string dictionaries are flagged as being in use at exit
15857
     * (which it is).  Under normal circumstances, this is fine because
15858
     * the memory will be automatically reclaimed by the system.  Under
15859
     * memory debugging, it's a huge source of useless noise, so we
15860
     * trade off slower shutdown for less distraction in the memory
15861
     * reports.  -baw
15862
     */
15863
    unicode_release_interned();
15864
#endif /* __INSURE__ */
15865
15866
0
    Py_CLEAR(unicode_empty);
15867
15868
0
    for (Py_ssize_t i = 0; i < 256; i++) {
15869
0
        Py_CLEAR(unicode_latin1[i]);
15870
0
    }
15871
0
    _PyUnicode_ClearStaticStrings();
15872
0
    (void)PyUnicode_ClearFreeList();
15873
15874
0
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15875
0
    PyMem_RawFree(interp->fs_codec.encoding);
15876
0
    interp->fs_codec.encoding = NULL;
15877
0
    PyMem_RawFree(interp->fs_codec.errors);
15878
0
    interp->fs_codec.errors = NULL;
15879
0
}
15880
15881
15882
/* A _string module, to export formatter_parser and formatter_field_name_split
15883
   to the string.Formatter class implemented in Python. */
15884
15885
/* Functions exported by the _string module.  Both entries use METH_O,
   i.e. each function receives exactly one argument.  The table ends with
   a NULL sentinel entry. */
static PyMethodDef _string_methods[] = {
15886
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15887
     METH_O, PyDoc_STR("split the argument as a field name")},
15888
    {"formatter_parser", (PyCFunction) formatter_parser,
15889
     METH_O, PyDoc_STR("parse the argument as a format string")},
15890
    {NULL, NULL}    /* sentinel */
15891
};
15892
15893
/* Definition of the _string module.  Fields are given positionally in
   PyModuleDef order; the module only provides the functions listed in
   _string_methods. */
static struct PyModuleDef _string_module = {
15894
    PyModuleDef_HEAD_INIT,
15895
    "_string",                          /* m_name */
15896
    PyDoc_STR("string helper module"),  /* m_doc */
15897
    0,                                  /* m_size */
15898
    _string_methods,                    /* m_methods */
15899
    NULL,                               /* m_slots */
15900
    NULL,                               /* m_traverse */
15901
    NULL,                               /* m_clear */
15902
    NULL                                /* m_free */
15903
};
15904
15905
PyMODINIT_FUNC
15906
PyInit__string(void)
15907
1
{
15908
1
    return PyModule_Create(&_string_module);
15909
1
}
15910
15911
15912
#ifdef __cplusplus
15913
}
15914
#endif