Coverage Report

Created: 2025-07-04 06:49

/src/cpython-install/include/python3.15/cpython/unicodeobject.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef Py_CPYTHON_UNICODEOBJECT_H
2
#  error "this header file must not be included directly"
3
#endif
4
5
/* Py_UNICODE was the native Unicode storage format (code unit) used by
6
   Python and represents a single Unicode element in the Unicode type.
7
   With PEP 393, Py_UNICODE is deprecated and replaced with a
8
   typedef to wchar_t. */
9
Py_DEPRECATED(3.13) typedef wchar_t PY_UNICODE_TYPE;
10
Py_DEPRECATED(3.13) typedef wchar_t Py_UNICODE;
11
12
13
/* --- Internal Unicode Operations ---------------------------------------- */
14
15
// Static inline functions to work with surrogates
16
0
// Return non-zero if ch lies in the surrogate range [0xD800; 0xDFFF].
static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
    return (ch >= 0xD800 && ch <= 0xDFFF);
}
19
0
// Return non-zero if ch lies in the high surrogate range [0xD800; 0xDBFF].
static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
    return (ch >= 0xD800 && ch <= 0xDBFF);
}
22
0
// Return non-zero if ch lies in the low surrogate range [0xDC00; 0xDFFF].
static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
    return (ch >= 0xDC00 && ch <= 0xDFFF);
}
25
26
// Join two surrogate characters and return a single Py_UCS4 value.
27
0
static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low)  {
28
0
    assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
29
0
    assert(Py_UNICODE_IS_LOW_SURROGATE(low));
30
0
    return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
31
0
}
32
33
// High surrogate = top 10 bits added to 0xD800.
34
// The character must be in the range [U+10000; U+10ffff].
35
0
static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
36
0
    assert(0x10000 <= ch && ch <= 0x10ffff);
37
0
    return (0xD800 - (0x10000 >> 10) + (ch >> 10));
38
0
}
39
40
// Low surrogate = bottom 10 bits added to 0xDC00.
41
// The character must be in the range [U+10000; U+10ffff].
42
0
static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
43
0
    assert(0x10000 <= ch && ch <= 0x10ffff);
44
0
    return (0xDC00 + (ch & 0x3FF));
45
0
}
46
47
48
/* --- Unicode Type ------------------------------------------------------- */
49
50
struct _PyUnicodeObject_state {
    /* Interning state.  A non-zero value means that the two references the
       dictionary holds to this object are *not* counted in ob_refcnt.
       Possible values:
           0: Not Interned
           1: Interned
           2: Interned and Immortal
           3: Interned, Immortal, and Static
       Keeping these categories apart lets the runtime pick the right
       cleanup mechanism at runtime shutdown. */
#ifdef Py_GIL_DISABLED
    // Must be accessed atomically, hence a whole byte rather than a bit field.
    unsigned char interned;
#else
    unsigned int interned:2;
#endif
    /* Character size:

       - PyUnicode_1BYTE_KIND (1):

         * character type = Py_UCS1 (8 bits, unsigned)
         * every character is in the range U+0000-U+00FF (latin1)
         * if ascii is set, every character is in the range U+0000-U+007F
           (ASCII); otherwise at least one character is in the range
           U+0080-U+00FF

       - PyUnicode_2BYTE_KIND (2):

         * character type = Py_UCS2 (16 bits, unsigned)
         * every character is in the range U+0000-U+FFFF (BMP)
         * at least one character is in the range U+0100-U+FFFF

       - PyUnicode_4BYTE_KIND (4):

         * character type = Py_UCS4 (32 bits, unsigned)
         * every character is in the range U+0000-U+10FFFF
         * at least one character is in the range U+10000-U+10FFFF
       */
    unsigned int kind:3;
    /* Allocation scheme.  Compact unicode objects need a single memory
       block, whereas non-compact objects use one block for the
       PyUnicodeObject struct and a second one for the data buffer. */
    unsigned int compact:1;
    /* Set when the string only contains characters in the range
       U+0000-U+007F (ASCII) and the kind is PyUnicode_1BYTE_KIND.  When both
       ascii and compact are set, the PyASCIIObject structure is used. */
    unsigned int ascii:1;
    /* Set when the object is statically allocated. */
    unsigned int statically_allocated:1;
#ifndef Py_GIL_DISABLED
    /* Historical padding which ensures that PyUnicode_DATA() is always
       aligned to 4 bytes (see issue gh-63736 on m68k). */
    unsigned int :24;
#endif
};
106
107
/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
108
   structure. state.ascii and state.compact are set, and the data
109
   immediately follow the structure. utf8_length can be found
110
   in the length field; the utf8 pointer is equal to the data pointer. */
111
typedef struct {
112
    /* There are 4 forms of Unicode strings:
113
114
       - compact ascii:
115
116
         * structure = PyASCIIObject
117
         * test: PyUnicode_IS_COMPACT_ASCII(op)
118
         * kind = PyUnicode_1BYTE_KIND
119
         * compact = 1
120
         * ascii = 1
121
         * (length is the length of the utf8)
122
         * (data starts just after the structure)
123
         * (since ASCII is decoded from UTF-8, the utf8 string are the data)
124
125
       - compact:
126
127
         * structure = PyCompactUnicodeObject
128
         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
129
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
130
           PyUnicode_4BYTE_KIND
131
         * compact = 1
132
         * ascii = 0
133
         * utf8 is not shared with data
134
         * utf8_length = 0 if utf8 is NULL
135
         * (data starts just after the structure)
136
137
       - legacy string:
138
139
         * structure = PyUnicodeObject structure
140
         * test: !PyUnicode_IS_COMPACT(op)
141
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
142
           PyUnicode_4BYTE_KIND
143
         * compact = 0
144
         * data.any is not NULL
145
         * utf8 is shared and utf8_length = length with data.any if ascii = 1
146
         * utf8_length = 0 if utf8 is NULL
147
148
       Compact strings use only one memory block (structure + characters),
149
       whereas legacy strings use one block for the structure and one block
150
       for characters.
151
152
       Legacy strings are created by subclasses of Unicode.
153
154
       See also _PyUnicode_CheckConsistency().
155
    */
156
    PyObject_HEAD
157
    Py_ssize_t length;          /* Number of code points in the string */
158
    Py_hash_t hash;             /* Hash value; -1 if not set */
159
    /* Ensure 4 byte alignment for PyUnicode_DATA(), see gh-63736 on m68k. */
160
   _Py_ALIGNED_DEF(4, struct _PyUnicodeObject_state) state;
161
} PyASCIIObject;
162
163
/* Non-ASCII strings allocated through PyUnicode_New use the
164
   PyCompactUnicodeObject structure. state.compact is set, and the data
165
   immediately follow the structure. */
166
typedef struct {
167
    PyASCIIObject _base;
168
    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
169
                                 * terminating \0. */
170
    char *utf8;                 /* UTF-8 representation (null-terminated) */
171
} PyCompactUnicodeObject;
172
173
/* Object format for Unicode subclasses. */
174
typedef struct {
175
    PyCompactUnicodeObject _base;
176
    union {
177
        void *any;
178
        Py_UCS1 *latin1;
179
        Py_UCS2 *ucs2;
180
        Py_UCS4 *ucs4;
181
    } data;                     /* Canonical, smallest-form Unicode buffer */
182
} PyUnicodeObject;
183
184
185
/* Cast op to a pointer to one of the Unicode object structures.  Each macro
   first asserts PyUnicode_Check(op) and then evaluates to the cast pointer. */
#define _PyASCIIObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyASCIIObject*, (op)))
#define _PyCompactUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyCompactUnicodeObject*, (op)))
#define _PyUnicodeObject_CAST(op) \
    (assert(PyUnicode_Check(op)), _Py_CAST(PyUnicodeObject*, (op)))
194
195
196
/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
197
198
/* Values for PyASCIIObject.state: */
199
200
/* Interning state: the values stored in the state.interned field. */
#define SSTATE_NOT_INTERNED 0               /* Not interned */
#define SSTATE_INTERNED_MORTAL 1            /* Interned */
#define SSTATE_INTERNED_IMMORTAL 2          /* Interned and immortal */
#define SSTATE_INTERNED_IMMORTAL_STATIC 3   /* Interned, immortal and static */
205
206
/* Use only if you know it's a string */
207
0
static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
208
0
#ifdef Py_GIL_DISABLED
209
0
    return _Py_atomic_load_uint8_relaxed(&_PyASCIIObject_CAST(op)->state.interned);
210
0
#else
211
0
    return _PyASCIIObject_CAST(op)->state.interned;
212
0
#endif
213
0
}
214
#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
215
216
/* For backward compatibility. Soft-deprecated. */
217
0
static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
218
0
    return 1;
219
0
}
220
#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
221
222
/* Return true if the string contains only ASCII characters, or 0 if not. The
223
   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not. */
224
0
static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
225
0
    return _PyASCIIObject_CAST(op)->state.ascii;
226
0
}
227
#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
228
229
/* Return true if the string is compact or 0 if not.
230
   No type checks are performed. */
231
0
static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
232
0
    return _PyASCIIObject_CAST(op)->state.compact;
233
0
}
234
#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
235
236
/* Return true if the string is a compact ASCII string (use PyASCIIObject
237
   structure), or 0 if not.  No type checks are performed. */
238
0
static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
239
0
    return (_PyASCIIObject_CAST(op)->state.ascii && PyUnicode_IS_COMPACT(op));
240
0
}
241
#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
242
243
/* Return values of the PyUnicode_KIND() function.  The numeric value of each
   enumerator matches the number in its name. */
enum PyUnicode_Kind {
    PyUnicode_1BYTE_KIND = 1,
    PyUnicode_2BYTE_KIND = 2,
    PyUnicode_4BYTE_KIND = 4
};
249
250
PyAPI_FUNC(int) PyUnicode_KIND(PyObject *op);
251
252
// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
253
//
254
// gh-89653: Converting this macro to a static inline function would introduce
255
// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
256
// unsigned numbers) where kind type is an int or on
257
// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
258
#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
259
260
/* Return a void pointer to the raw unicode buffer. */
261
0
static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
262
0
    if (PyUnicode_IS_ASCII(op)) {
263
0
        return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
264
0
    }
265
0
    return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
266
0
}
267
268
0
/* Return the data.any buffer pointer of a non-compact string.  Asserts that
   the string is not compact and that the pointer is not NULL. */
static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
    PyUnicodeObject *unicode;

    assert(!PyUnicode_IS_COMPACT(op));
    unicode = _PyUnicodeObject_CAST(op);
    assert(unicode->data.any != NULL);
    return unicode->data.any;
}
275
276
PyAPI_FUNC(void*) PyUnicode_DATA(PyObject *op);
277
278
0
static inline void* _PyUnicode_DATA(PyObject *op) {
279
0
    if (PyUnicode_IS_COMPACT(op)) {
280
0
        return _PyUnicode_COMPACT_DATA(op);
281
0
    }
282
0
    return _PyUnicode_NONCOMPACT_DATA(op);
283
0
}
284
#define PyUnicode_DATA(op) _PyUnicode_DATA(_PyObject_CAST(op))
285
286
/* Typed views of PyUnicode_DATA(op): the pointer to the canonical
   representation cast to Py_UCS1, Py_UCS2 or Py_UCS4 for direct character
   access.
   No checks are performed; call PyUnicode_KIND() first to make sure the
   chosen macro matches the string. */

#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
294
295
/* Returns the length of the unicode string. */
296
0
static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
297
0
    return _PyASCIIObject_CAST(op)->length;
298
0
}
299
#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
300
301
/* Returns the cached hash, or -1 if not cached yet. */
302
static inline Py_hash_t
303
0
PyUnstable_Unicode_GET_CACHED_HASH(PyObject *op) {
304
0
    assert(PyUnicode_Check(op));
305
0
#ifdef Py_GIL_DISABLED
306
0
    return _Py_atomic_load_ssize_relaxed(&_PyASCIIObject_CAST(op)->hash);
307
0
#else
308
0
    return _PyASCIIObject_CAST(op)->hash;
309
0
#endif
310
0
}
311
312
/* Write into the canonical representation, this function does not do any sanity
313
   checks and is intended for usage in loops.  The caller should cache the
314
   kind and data pointers obtained from other function calls.
315
   index is the index in the string (starts at 0) and value is the new
316
   code point value which should be written to that location. */
317
static inline void PyUnicode_WRITE(int kind, void *data,
318
                                   Py_ssize_t index, Py_UCS4 value)
319
0
{
320
0
    assert(index >= 0);
321
0
    if (kind == PyUnicode_1BYTE_KIND) {
322
0
        assert(value <= 0xffU);
323
0
        _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
324
0
    }
325
0
    else if (kind == PyUnicode_2BYTE_KIND) {
326
0
        assert(value <= 0xffffU);
327
0
        _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
328
0
    }
329
0
    else {
330
0
        assert(kind == PyUnicode_4BYTE_KIND);
331
0
        assert(value <= 0x10ffffU);
332
0
        _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
333
0
    }
334
0
}
335
#define PyUnicode_WRITE(kind, data, index, value) \
336
    PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
337
                    (index), _Py_STATIC_CAST(Py_UCS4, value))
338
339
/* Read a code point from the string's canonical representation.  No checks
340
   are performed. */
341
static inline Py_UCS4 PyUnicode_READ(int kind,
342
                                     const void *data, Py_ssize_t index)
343
0
{
344
0
    assert(index >= 0);
345
0
    if (kind == PyUnicode_1BYTE_KIND) {
346
0
        return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
347
0
    }
348
0
    if (kind == PyUnicode_2BYTE_KIND) {
349
0
        return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
350
0
    }
351
0
    assert(kind == PyUnicode_4BYTE_KIND);
352
0
    return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
353
0
}
354
#define PyUnicode_READ(kind, data, index) \
355
    PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
356
                   _Py_STATIC_CAST(const void*, data), \
357
                   (index))
358
359
/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
360
   calls PyUnicode_KIND() and might call it twice.  For single reads, use
361
   PyUnicode_READ_CHAR, for multiple consecutive reads callers should
362
   cache kind and use PyUnicode_READ instead. */
363
static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
364
0
{
365
0
    int kind;
366
0
367
0
    assert(index >= 0);
368
0
    // Tolerate reading the NUL character at str[len(str)]
369
0
    assert(index <= PyUnicode_GET_LENGTH(unicode));
370
0
371
0
    kind = PyUnicode_KIND(unicode);
372
0
    if (kind == PyUnicode_1BYTE_KIND) {
373
0
        return PyUnicode_1BYTE_DATA(unicode)[index];
374
0
    }
375
0
    if (kind == PyUnicode_2BYTE_KIND) {
376
0
        return PyUnicode_2BYTE_DATA(unicode)[index];
377
0
    }
378
0
    assert(kind == PyUnicode_4BYTE_KIND);
379
0
    return PyUnicode_4BYTE_DATA(unicode)[index];
380
0
}
381
#define PyUnicode_READ_CHAR(unicode, index) \
382
    PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
383
384
/* Return a maximum character value which is suitable for creating another
385
   string based on op.  This is always an approximation but more efficient
386
   than iterating over the string. */
387
static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
388
0
{
389
0
    int kind;
390
0
391
0
    if (PyUnicode_IS_ASCII(op)) {
392
0
        return 0x7fU;
393
0
    }
394
0
395
0
    kind = PyUnicode_KIND(op);
396
0
    if (kind == PyUnicode_1BYTE_KIND) {
397
0
       return 0xffU;
398
0
    }
399
0
    if (kind == PyUnicode_2BYTE_KIND) {
400
0
        return 0xffffU;
401
0
    }
402
0
    assert(kind == PyUnicode_4BYTE_KIND);
403
0
    return 0x10ffffU;
404
0
}
405
#define PyUnicode_MAX_CHAR_VALUE(op) \
406
    PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
407
408
409
/* === Public API ========================================================= */
410
411
/* With PEP 393, this is the recommended way to allocate a new unicode object.
412
   This function will allocate the object and its buffer in a single memory
413
   block.  Objects created using this function are not resizable. */
414
PyAPI_FUNC(PyObject*) PyUnicode_New(
415
    Py_ssize_t size,            /* Number of code points in the new string */
416
    Py_UCS4 maxchar             /* maximum code point value in the string */
417
    );
418
419
/* For backward compatibility. Soft-deprecated. */
420
static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
421
0
{
422
0
    return 0;
423
0
}
424
#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
425
426
/* Copy character from one unicode object into another, this function performs
427
   character conversion when necessary and falls back to memcpy() if possible.
428
429
   Fail if to is too small (smaller than *how_many* or smaller than
430
   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
431
   kind(to), or if *to* has more than 1 reference.
432
433
   Return the number of written character, or return -1 and raise an exception
434
   on error.
435
436
   Pseudo-code:
437
438
       how_many = min(how_many, len(from) - from_start)
439
       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
440
       return how_many
441
442
   Note: The function doesn't write a terminating null character.
443
   */
444
PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
445
    PyObject *to,
446
    Py_ssize_t to_start,
447
    PyObject *from,
448
    Py_ssize_t from_start,
449
    Py_ssize_t how_many
450
    );
451
452
/* Fill a string with a character: write fill_char into
453
   unicode[start:start+length].
454
455
   Fail if fill_char is bigger than the string maximum character, or if the
456
   string has more than 1 reference.
457
458
   Return the number of written character, or return -1 and raise an exception
459
   on error. */
460
PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
461
    PyObject *unicode,
462
    Py_ssize_t start,
463
    Py_ssize_t length,
464
    Py_UCS4 fill_char
465
    );
466
467
/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
468
   Scan the string to find the maximum character. */
469
PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
470
    int kind,
471
    const void *buffer,
472
    Py_ssize_t size);
473
474
475
/* --- Public PyUnicodeWriter API ----------------------------------------- */
476
477
typedef struct PyUnicodeWriter PyUnicodeWriter;
478
479
PyAPI_FUNC(PyUnicodeWriter*) PyUnicodeWriter_Create(Py_ssize_t length);
480
PyAPI_FUNC(void) PyUnicodeWriter_Discard(PyUnicodeWriter *writer);
481
PyAPI_FUNC(PyObject*) PyUnicodeWriter_Finish(PyUnicodeWriter *writer);
482
483
PyAPI_FUNC(int) PyUnicodeWriter_WriteChar(
484
    PyUnicodeWriter *writer,
485
    Py_UCS4 ch);
486
PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
487
    PyUnicodeWriter *writer,
488
    const char *str,
489
    Py_ssize_t size);
490
PyAPI_FUNC(int) PyUnicodeWriter_WriteASCII(
491
    PyUnicodeWriter *writer,
492
    const char *str,
493
    Py_ssize_t size);
494
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
495
    PyUnicodeWriter *writer,
496
    const wchar_t *str,
497
    Py_ssize_t size);
498
PyAPI_FUNC(int) PyUnicodeWriter_WriteUCS4(
499
    PyUnicodeWriter *writer,
500
    Py_UCS4 *str,
501
    Py_ssize_t size);
502
503
PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
504
    PyUnicodeWriter *writer,
505
    PyObject *obj);
506
PyAPI_FUNC(int) PyUnicodeWriter_WriteRepr(
507
    PyUnicodeWriter *writer,
508
    PyObject *obj);
509
PyAPI_FUNC(int) PyUnicodeWriter_WriteSubstring(
510
    PyUnicodeWriter *writer,
511
    PyObject *str,
512
    Py_ssize_t start,
513
    Py_ssize_t end);
514
PyAPI_FUNC(int) PyUnicodeWriter_Format(
515
    PyUnicodeWriter *writer,
516
    const char *format,
517
    ...);
518
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
519
    PyUnicodeWriter *writer,
520
    const char *string,         /* UTF-8 encoded string */
521
    Py_ssize_t length,          /* size of string */
522
    const char *errors,         /* error handling */
523
    Py_ssize_t *consumed);      /* bytes consumed */
524
525
526
/* --- Private _PyUnicodeWriter API --------------------------------------- */
527
528
typedef struct {
529
    PyObject *buffer;
530
    void *data;
531
    int kind;
532
    Py_UCS4 maxchar;
533
    Py_ssize_t size;
534
    Py_ssize_t pos;
535
536
    /* minimum number of allocated characters (default: 0) */
537
    Py_ssize_t min_length;
538
539
    /* minimum character (default: 127, ASCII) */
540
    Py_UCS4 min_char;
541
542
    /* If non-zero, overallocate the buffer (default: 0). */
543
    unsigned char overallocate;
544
545
    /* If readonly is 1, buffer is a shared string (cannot be modified)
546
       and size is set to 0. */
547
    unsigned char readonly;
548
} _PyUnicodeWriter;
549
550
// Initialize a Unicode writer.
551
//
552
// By default, the minimum buffer size is 0 character and overallocation is
553
// disabled. Set min_length, min_char and overallocate attributes to control
554
// the allocation of the buffer.
555
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(void) _PyUnicodeWriter_Init(
556
    _PyUnicodeWriter *writer);
557
558
/* Prepare the buffer to write 'length' characters
559
   with the specified maximum character.
560
561
   Return 0 on success, raise an exception and return -1 on error. */
562
#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
563
    (((MAXCHAR) <= (WRITER)->maxchar                                  \
564
      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
565
     ? 0                                                              \
566
     : (((LENGTH) == 0)                                               \
567
        ? 0                                                           \
568
        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
569
570
/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
571
   instead. */
572
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_PrepareInternal(
573
    _PyUnicodeWriter *writer,
574
    Py_ssize_t length,
575
    Py_UCS4 maxchar);
576
577
/* Prepare the buffer to have at least the kind KIND.
578
   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
579
   support characters in range U+000-U+FFFF.
580
581
   Return 0 on success, raise an exception and return -1 on error. */
582
#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
583
    ((KIND) <= (WRITER)->kind                                         \
584
     ? 0                                                              \
585
     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
586
587
/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
588
   macro instead. */
589
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_PrepareKindInternal(
590
    _PyUnicodeWriter *writer,
591
    int kind);
592
593
/* Append a Unicode character.
594
   Return 0 on success, raise an exception and return -1 on error. */
595
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteChar(
596
    _PyUnicodeWriter *writer,
597
    Py_UCS4 ch);
598
599
/* Append a Unicode string.
600
   Return 0 on success, raise an exception and return -1 on error. */
601
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteStr(
602
    _PyUnicodeWriter *writer,
603
    PyObject *str);               /* Unicode string */
604
605
/* Append a substring of a Unicode string.
606
   Return 0 on success, raise an exception and return -1 on error. */
607
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteSubstring(
608
    _PyUnicodeWriter *writer,
609
    PyObject *str,              /* Unicode string */
610
    Py_ssize_t start,
611
    Py_ssize_t end);
612
613
/* Append an ASCII-encoded byte string.
614
   Return 0 on success, raise an exception and return -1 on error. */
615
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteASCIIString(
616
    _PyUnicodeWriter *writer,
617
    const char *str,           /* ASCII-encoded byte string */
618
    Py_ssize_t len);           /* number of bytes, or -1 if unknown */
619
620
/* Append a latin1-encoded byte string.
621
   Return 0 on success, raise an exception and return -1 on error. */
622
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(int) _PyUnicodeWriter_WriteLatin1String(
623
    _PyUnicodeWriter *writer,
624
    const char *str,           /* latin1-encoded byte string */
625
    Py_ssize_t len);           /* length in bytes */
626
627
/* Get the value of the writer as a Unicode string. Clear the
628
   buffer of the writer. Raise an exception and return NULL
629
   on error. */
630
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(PyObject *) _PyUnicodeWriter_Finish(
631
    _PyUnicodeWriter *writer);
632
633
/* Deallocate memory of a writer (clear its internal buffer). */
634
_Py_DEPRECATED_EXTERNALLY(3.14) PyAPI_FUNC(void) _PyUnicodeWriter_Dealloc(
635
    _PyUnicodeWriter *writer);
636
637
638
/* --- Manage the default encoding ---------------------------------------- */
639
640
/* Returns a pointer to the default encoding (UTF-8) of the
641
   Unicode object unicode.
642
643
   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
644
   in the unicodeobject.
645
646
   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
647
   support the previous internal function with the same behaviour.
648
649
   Use of this API is DEPRECATED since no size information can be
650
   extracted from the returned data.
651
*/
652
653
PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
654
655
// Deprecated alias kept for backward compatibility
656
Py_DEPRECATED(3.14) static inline const char*
657
_PyUnicode_AsString(PyObject *unicode)
658
0
{
659
0
    return PyUnicode_AsUTF8(unicode);
660
0
}
661
662
663
/* === Characters Type APIs =============================================== */
664
665
/* These should not be used directly. Use the Py_UNICODE_IS* and
666
   Py_UNICODE_TO* macros instead.
667
668
   These APIs are implemented in Objects/unicodectype.c.
669
670
*/
671
672
PyAPI_FUNC(int) _PyUnicode_IsLowercase(
673
    Py_UCS4 ch       /* Unicode character */
674
    );
675
676
PyAPI_FUNC(int) _PyUnicode_IsUppercase(
677
    Py_UCS4 ch       /* Unicode character */
678
    );
679
680
PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
681
    Py_UCS4 ch       /* Unicode character */
682
    );
683
684
PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
685
    const Py_UCS4 ch         /* Unicode character */
686
    );
687
688
PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
689
    const Py_UCS4 ch         /* Unicode character */
690
    );
691
692
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
693
    Py_UCS4 ch       /* Unicode character */
694
    );
695
696
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
697
    Py_UCS4 ch       /* Unicode character */
698
    );
699
700
PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
701
    Py_UCS4 ch       /* Unicode character */
702
    );
703
704
PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
705
    Py_UCS4 ch       /* Unicode character */
706
    );
707
708
PyAPI_FUNC(int) _PyUnicode_ToDigit(
709
    Py_UCS4 ch       /* Unicode character */
710
    );
711
712
PyAPI_FUNC(double) _PyUnicode_ToNumeric(
713
    Py_UCS4 ch       /* Unicode character */
714
    );
715
716
PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
717
    Py_UCS4 ch       /* Unicode character */
718
    );
719
720
PyAPI_FUNC(int) _PyUnicode_IsDigit(
721
    Py_UCS4 ch       /* Unicode character */
722
    );
723
724
PyAPI_FUNC(int) _PyUnicode_IsNumeric(
725
    Py_UCS4 ch       /* Unicode character */
726
    );
727
728
PyAPI_FUNC(int) _PyUnicode_IsPrintable(
729
    Py_UCS4 ch       /* Unicode character */
730
    );
731
732
PyAPI_FUNC(int) _PyUnicode_IsAlpha(
733
    Py_UCS4 ch       /* Unicode character */
734
    );
735
736
// Helper array used by Py_UNICODE_ISSPACE().
737
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
738
739
// Since splitting on whitespace is an important use case, and
740
// whitespace in most situations is solely ASCII whitespace, we
741
// optimize for the common case by using a quick look-up table
742
// _Py_ascii_whitespace (see below) with an inlined check.
743
0
static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
744
0
    if (ch < 128) {
745
0
        return _Py_ascii_whitespace[ch];
746
0
    }
747
0
    return _PyUnicode_IsWhitespace(ch);
748
0
}
749
750
/* Macro aliases for the _PyUnicode_Is* and _PyUnicode_To* functions; each
   one forwards its argument unchanged. */

/* Case and line break predicates. */
#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

/* Case conversions. */
#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

/* Numeric and printable predicates. */
#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

/* Numeric conversions. */
#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

/* Alphabetic predicate. */
#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
769
770
0
// Return 1 if ch satisfies any of Py_UNICODE_ISALPHA, Py_UNICODE_ISDECIMAL,
// Py_UNICODE_ISDIGIT or Py_UNICODE_ISNUMERIC, and 0 otherwise.  The checks
// run in that order and stop at the first one that succeeds.
static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
    if (Py_UNICODE_ISALPHA(ch)) {
        return 1;
    }
    if (Py_UNICODE_ISDECIMAL(ch)) {
        return 1;
    }
    if (Py_UNICODE_ISDIGIT(ch)) {
        return 1;
    }
    return Py_UNICODE_ISNUMERIC(ch) != 0;
}
776
777
778
/* === Misc functions ===================================================== */
779
780
// Return an interned Unicode object for an Identifier; may fail if there is no
781
// memory.
782
PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);