/src/cpython/Objects/unicode_writer.c

Source
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#include "Python.h"
#include "pycore_freelist.h"      // _Py_FREELIST_FREE()
#include "pycore_long.h"          // _PyLong_FormatWriter()
#include "pycore_unicodeobject.h" // _PyUnicode_Result()


/* Growth divisor used by _PyUnicodeWriter_PrepareInternal(): when the
   writer has overallocation enabled, a requested length `n` is padded
   with n / OVERALLOCATE_FACTOR extra characters (2 -> +50%, 4 -> +25%). */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor
      (this branch is taken on every non-Windows platform) */
#  define OVERALLOCATE_FACTOR 4
#endif


/* Compilation of templated routines */

#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()

#include "stringlib/ucs1lib.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"


/* Copy an ASCII or latin1 char* string into a Python Unicode string.

   Writes `len` characters taken from `str` into `unicode`, starting at
   character offset `index`.  Every input byte is stored as one code point
   in the range U+0000..U+00FF, whatever the kind of `unicode` is.  The
   caller must guarantee that index + len fits in `unicode` (asserted).

   WARNING: The function doesn't copy the terminating null character and
   doesn't check the maximum character (may write a latin1 character in an
   ASCII string); only Py_DEBUG builds assert on the latter. */
static void
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
                   const char *str, Py_ssize_t len)
{
    int kind = PyUnicode_KIND(unicode);
    void *data = PyUnicode_DATA(unicode);
    const char *end = str + len;

    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
    switch (kind) {
    case PyUnicode_1BYTE_KIND: {
#ifdef Py_DEBUG
        if (PyUnicode_IS_ASCII(unicode)) {
            Py_UCS4 maxchar = ucs1lib_find_max_char(
                (const Py_UCS1*)str,
                (const Py_UCS1*)str + len);
            assert(maxchar < 128);
        }
#endif
        /* Same element width: a plain byte copy is enough. */
        memcpy((char *) data + index, str, len);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        Py_UCS2 *start = (Py_UCS2 *)data + index;
        Py_UCS2 *ucs2 = start;

        /* Widen through unsigned char: plain char may be signed, and a
           byte >= 0x80 would otherwise be converted from a negative value
           and come out as 0xFFxx instead of 0x00xx. */
        for (; str < end; ++ucs2, ++str)
            *ucs2 = (Py_UCS2)(unsigned char)*str;

        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
        break;
    }
    case PyUnicode_4BYTE_KIND: {
        Py_UCS4 *start = (Py_UCS4 *)data + index;
        Py_UCS4 *ucs4 = start;

        /* Same unsigned-char widening as the 2-byte case; a negative char
           would otherwise become 0xFFFFFFxx. */
        for (; str < end; ++ucs4, ++str)
            *ucs4 = (Py_UCS4)(unsigned char)*str;

        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
        break;
    }
    default:
        Py_UNREACHABLE();
    }
}


/* Refresh the fields the writer caches about writer->buffer (data pointer,
   maxchar, kind and size).  In this file it is called right after the
   buffer has been assigned or replaced. */
static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
    PyObject *buf = writer->buffer;

    writer->data = PyUnicode_DATA(buf);
    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(buf);

    if (writer->readonly) {
        /* kind 0 is smaller than PyUnicode_1BYTE_KIND, so
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
        writer->kind = 0;
        assert(writer->kind <= PyUnicode_1BYTE_KIND);

        /* Copy-on-write mode: a reported size of 0 makes
         * _PyUnicodeWriter_Prepare() copy (and enlarge) the buffer on the
         * next write. */
        writer->size = 0;
        return;
    }

    /* Writable buffer: mirror its real geometry. */
    writer->size = PyUnicode_GET_LENGTH(buf);
    writer->kind = PyUnicode_KIND(buf);
}


/* Reset `writer` to its initial state: every field zeroed (so no buffer
   and position 0), except min_char which starts at 127. */
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
    memset(writer, 0, sizeof *writer);

    /* The zeroed kind is deliberately below PyUnicode_1BYTE_KIND so that
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
    assert(writer->kind == 0);
    assert(writer->kind < PyUnicode_1BYTE_KIND);

    /* ASCII is the bare minimum */
    writer->min_char = 127;
}


/* Create a writer, preparing it for `length` characters up front.

   `length` may be 0; only negative values are rejected (ValueError).
   The writer struct comes from the unicode_writers freelist when one is
   available, otherwise from PyMem_Malloc().

   Returns the new writer, or NULL with an exception set on failure. */
PyUnicodeWriter*
PyUnicodeWriter_Create(Py_ssize_t length)
{
    if (length < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "length must be positive");
        return NULL;
    }

    PyUnicodeWriter *pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
    if (pub_writer == NULL) {
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(sizeof(_PyUnicodeWriter));
        if (pub_writer == NULL) {
            PyErr_NoMemory();
            return NULL;
        }
    }

    /* The public handle is the private struct under an opaque type. */
    _PyUnicodeWriter *priv = (_PyUnicodeWriter *)pub_writer;
    _PyUnicodeWriter_Init(priv);

    if (_PyUnicodeWriter_Prepare(priv, length, 127) < 0) {
        PyUnicodeWriter_Discard(pub_writer);
        return NULL;
    }

    /* Switched on only after the initial preparation, so the overallocation
       margin applies to later growth and not to the first allocation. */
    priv->overallocate = 1;

    return pub_writer;
}


/* Throw away `writer` without producing a string: drop its buffer and hand
   the struct back to the unicode_writers freelist (or PyMem_Free).
   A NULL `writer` is accepted and ignored. */
void
PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}


// Initialize _PyUnicodeWriter with initial buffer
//
// All fields are zeroed first, then `buffer` is stored as the writer's
// buffer and the cached kind/size/maxchar/data are refreshed from it.
// min_length is set to the buffer's length, so later reallocations never
// go below the initial size.
//
// NOTE(review): the pointer is stored without Py_INCREF, so the writer
// presumably takes over the caller's reference -- confirm against callers.
void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{
    memset(writer, 0, sizeof(*writer));
    writer->buffer = buffer;
    // readonly is 0 after the memset, so Update() records the real length.
    _PyUnicodeWriter_Update(writer);
    writer->min_length = writer->size;
}


int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(length >= 0);
    assert(maxchar <= _Py_MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}

/* Widen the writer's buffer so it can hold characters of `kind`, without
   reserving additional length.  `kind` must be larger than the writer's
   current kind (asserted).

   Returns the result of _PyUnicodeWriter_PrepareInternal(). */
int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind)
{
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
    assert(writer->kind < kind);

    /* Largest code point representable in the requested kind. */
    Py_UCS4 widest;
    if (kind == PyUnicode_1BYTE_KIND) {
        widest = 0xff;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        widest = 0xffff;
    }
    else if (kind == PyUnicode_4BYTE_KIND) {
        widest = _Py_MAX_UNICODE;
    }
    else {
        Py_UNREACHABLE();
    }

    return _PyUnicodeWriter_PrepareInternal(writer, 0, widest);
}


/* Out-of-line entry point for writing one character: forwards `writer`
   and `ch` to _PyUnicodeWriter_WriteCharInline() (defined outside this
   file) and returns its result unchanged.  Presumably appends `ch` to the
   writer and follows the 0 / -1 convention used by the other write
   functions -- confirm against the inline helper. */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}


/* Public variant of _PyUnicodeWriter_WriteChar() that validates `ch`.

   Returns the result of _PyUnicodeWriter_WriteChar(), or -1 with
   ValueError set when `ch` is above _Py_MAX_UNICODE. */
int
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
{
    if (ch <= _Py_MAX_UNICODE) {
        return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
    }

    PyErr_SetString(PyExc_ValueError,
                    "character must be in range(0x110000)");
    return -1;
}


/* Append the whole Unicode string `str` to the writer.

   An empty `str` is a no-op.  When the writer has no buffer yet and does
   not overallocate, `str` itself becomes the (read-only) buffer via a new
   reference instead of being copied.

   Returns 0 on success, -1 if preparing the buffer fails. */
int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{
    assert(PyUnicode_Check(str));

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    if (nchars == 0) {
        return 0;
    }

    const Py_UCS4 str_maxchar = PyUnicode_MAX_CHAR_VALUE(str);
    const int fits = (str_maxchar <= writer->maxchar
                      && nchars <= writer->size - writer->pos);

    if (!fits) {
        if (writer->buffer == NULL && !writer->overallocate) {
            /* Nothing written yet: borrow the string as-is. */
            assert(_PyUnicode_CheckConsistency(str, 1));
            writer->readonly = 1;
            writer->buffer = Py_NewRef(str);
            _PyUnicodeWriter_Update(writer);
            writer->pos += nchars;
            return 0;
        }
        if (_PyUnicodeWriter_PrepareInternal(writer, nchars, str_maxchar) == -1) {
            return -1;
        }
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, 0, nchars);
    writer->pos += nchars;
    return 0;
}


/* Append the str() of `obj` to the writer.

   Exact str objects are written directly and exact int objects are
   formatted in base 10 straight into the writer; anything else goes
   through PyObject_Str().

   Returns 0 on success, -1 on failure. */
int
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    PyTypeObject *tp = Py_TYPE(obj);

    if (tp == &PyUnicode_Type) {
        return _PyUnicodeWriter_WriteStr(priv, obj);
    }
    if (tp == &PyLong_Type) {
        return _PyLong_FormatWriter(priv, obj, 10, 0);
    }

    PyObject *text = PyObject_Str(obj);
    if (text == NULL) {
        return -1;
    }
    int status = _PyUnicodeWriter_WriteStr(priv, text);
    Py_DECREF(text);
    return status;
}


/* Append the repr() of `obj` to the writer.

   A NULL `obj` is written as the 6-character text <NULL>.  Exact int
   objects are formatted in base 10 straight into the writer; anything
   else goes through PyObject_Repr().

   Returns 0 on success, -1 on failure. */
int
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;

    if (obj == NULL) {
        return _PyUnicodeWriter_WriteASCIIString(priv, "<NULL>", 6);
    }
    if (Py_TYPE(obj) == &PyLong_Type) {
        return _PyLong_FormatWriter(priv, obj, 10, 0);
    }

    PyObject *text = PyObject_Repr(obj);
    if (text == NULL) {
        return -1;
    }
    int status = _PyUnicodeWriter_WriteStr(priv, text);
    Py_DECREF(text);
    return status;
}


/* Append str[start:end] to the writer.

   The bounds are only asserted (0 <= start <= end <= len(str)); callers
   must validate them.  A slice covering the whole string is delegated to
   _PyUnicodeWriter_WriteStr(); an empty slice is a no-op.

   Returns 0 on success, -1 if preparing the buffer fails. */
int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
                                Py_ssize_t start, Py_ssize_t end)
{
    assert(0 <= start);
    assert(end <= PyUnicode_GET_LENGTH(str));
    assert(start <= end);

    if (start == 0 && end == PyUnicode_GET_LENGTH(str)) {
        return _PyUnicodeWriter_WriteStr(writer, str);
    }

    const Py_ssize_t count = end - start;
    if (count == 0) {
        return 0;
    }

    /* Scan the slice for its real maximum only when the string as a whole
       could exceed what the buffer already supports. */
    Py_UCS4 needed = writer->maxchar;
    if (PyUnicode_MAX_CHAR_VALUE(str) > needed) {
        needed = _PyUnicode_FindMaxChar(str, start, end);
    }

    if (_PyUnicodeWriter_Prepare(writer, count, needed) < 0) {
        return -1;
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, start, count);
    writer->pos += count;
    return 0;
}


/* Public, argument-checking variant of _PyUnicodeWriter_WriteSubstring().

   Fails with TypeError when `str` is not a str, and with ValueError when
   `start` is negative or greater than `end`, or when `end` is past the
   end of `str`.

   Returns 0 on success, -1 with an exception set on failure. */
int
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
                               Py_ssize_t start, Py_ssize_t end)
{
    if (!PyUnicode_Check(str)) {
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
        return -1;
    }

    /* `start` is checked before `end`, so a bad start wins when both are
       out of range. */
    const int bad_start = (start < 0 || start > end);
    if (bad_start) {
        PyErr_Format(PyExc_ValueError, "invalid start argument");
        return -1;
    }
    if (end > PyUnicode_GET_LENGTH(str)) {
        PyErr_Format(PyExc_ValueError, "invalid end argument");
        return -1;
    }

    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    return _PyUnicodeWriter_WriteSubstring(priv, str, start, end);
}


/* Append `len` bytes of the ASCII string `ascii` to the writer.

   A `len` of -1 means "use strlen(ascii)".  An empty input is a no-op.
   The bytes must all be below 128; this is only asserted.  When the writer
   has no buffer yet and does not overallocate, a new string built from the
   input becomes the (read-only) buffer directly.

   Returns 0 on success, -1 on failure. */
int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
                                  const char *ascii, Py_ssize_t len)
{
    if (len == -1) {
        len = strlen(ascii);
    }
    if (len == 0) {
        return 0;
    }

    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);

    if (writer->buffer == NULL && !writer->overallocate) {
        /* Nothing written yet: the new string is the whole result so far. */
        PyObject *whole = _PyUnicode_FromASCII(ascii, len);
        if (whole == NULL) {
            return -1;
        }

        writer->readonly = 1;
        writer->buffer = whole;
        _PyUnicodeWriter_Update(writer);
        writer->pos += len;
        return 0;
    }

    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
        return -1;
    }

    /* Copy into the buffer, widening each byte to the buffer's kind. */
    const char *stop = ascii + len;
    switch (writer->kind)
    {
    case PyUnicode_1BYTE_KIND:
    {
        Py_UCS1 *dest = (Py_UCS1 *)writer->data + writer->pos;

        memcpy(dest, (const Py_UCS1 *)ascii, len);
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS2,
            ascii, stop,
            (Py_UCS2 *)writer->data + writer->pos);
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS4,
            ascii, stop,
            (Py_UCS4 *)writer->data + writer->pos);
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    writer->pos += len;
    return 0;
}


/* Public wrapper around _PyUnicodeWriter_WriteASCIIString(): asserts that
   `writer` is non-NULL and that the calling thread holds a thread state,
   then forwards `str` and `size` unchanged and returns the helper's
   result. */
int
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
                           const char *str,
                           Py_ssize_t size)
{
    assert(writer != NULL);
    _Py_AssertHoldsTstate();

    return _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter*)writer,
                                             str, size);
}


/* Decode `size` bytes of `str` as UTF-8 with the strict error handler and
   append the result to the writer.

   A negative `size` means "use strlen(str)".  If decoding fails, the
   write position is rewound to where it was before the call.

   Returns the result of _PyUnicode_DecodeUTF8Writer() (negative on
   failure). */
int
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
                          const char *str,
                          Py_ssize_t size)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    const Py_ssize_t saved_pos = priv->pos;

    if (size < 0) {
        size = strlen(str);
    }

    int status = _PyUnicode_DecodeUTF8Writer(priv, str, size,
                                             _Py_ERROR_STRICT, NULL, NULL);
    if (status < 0) {
        /* Forget anything decoded before the error. */
        priv->pos = saved_pos;
    }
    return status;
}


/* Decode `length` bytes of `string` as UTF-8 using the error handler named
   by `errors`, and append the result to the writer.  `consumed` is passed
   through to _PyUnicode_DecodeUTF8Writer() and may be NULL.

   A negative `length` means "use strlen(string)".  If decoding fails, the
   write position is rewound to where it was before the call and, when
   `consumed` is not NULL, *consumed is set to 0.

   Returns the result of _PyUnicode_DecodeUTF8Writer() (negative on
   failure). */
int
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                   const char *string,
                                   Py_ssize_t length,
                                   const char *errors,
                                   Py_ssize_t *consumed)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    const Py_ssize_t saved_pos = priv->pos;

    if (length < 0) {
        length = strlen(string);
    }

    int status = _PyUnicode_DecodeUTF8Writer(priv, string, length,
                                             _Py_ERROR_UNKNOWN, errors,
                                             consumed);
    if (status >= 0) {
        return status;
    }

    /* Failure: undo the partial write and report nothing consumed. */
    priv->pos = saved_pos;
    if (consumed) {
        *consumed = 0;
    }
    return status;
}


/* Append `len` bytes of `str`, interpreted as Latin-1 (one byte per code
   point), to the writer.

   The maximum character is computed from the input so the buffer is only
   widened as far as needed.  An empty input is a no-op.

   Returns 0 on success, -1 if preparing the buffer fails. */
int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                   const char *str, Py_ssize_t len)
{
    /* Nothing to append.  This matches the other write functions in this
       file, and it keeps unicode_write_cstr() from being handed a
       writer->buffer that is still NULL when nothing has been written yet
       (NOTE(review): _PyUnicodeWriter_Prepare is defined outside this
       file; it appears not to allocate for a zero-length request). */
    if (len == 0) {
        return 0;
    }

    Py_UCS4 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str,
                                            (const Py_UCS1*)str + len);
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1) {
        return -1;
    }
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
    writer->pos += len;
    return 0;
}


/* Turn the writer's contents into the final string and detach it from the
   writer; writer->buffer is NULL afterwards on every path.

   - Nothing written: the buffer (if any) is released and the empty string
     from _PyUnicode_GetEmpty() is returned.
   - Read-only buffer: returned as-is (its length equals the write
     position, asserted).
   - Otherwise the buffer is shrunk to the write position when it is
     longer, then passed through _PyUnicode_Result().

   Returns the string, or NULL if shrinking the buffer fails (the buffer
   is released in that case). */
PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{
    if (writer->pos == 0) {
        Py_CLEAR(writer->buffer);
        return _PyUnicode_GetEmpty();
    }

    /* Take the buffer out of the writer. */
    PyObject *result = writer->buffer;
    writer->buffer = NULL;

    if (writer->readonly) {
        assert(PyUnicode_GET_LENGTH(result) == writer->pos);
        return result;
    }

    if (PyUnicode_GET_LENGTH(result) != writer->pos) {
        /* Drop the unused tail. */
        PyObject *shrunk = _PyUnicode_ResizeCompact(result, writer->pos);
        if (shrunk == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        result = shrunk;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return _PyUnicode_Result(result);
}


/* Finish a writer created by PyUnicodeWriter_Create(): build the result
   with _PyUnicodeWriter_Finish(), then hand the writer struct back to the
   unicode_writers freelist (or PyMem_Free).  The struct is released whether
   or not finishing succeeded, so `writer` must not be used afterwards.

   Returns whatever _PyUnicodeWriter_Finish() returned. */
PyObject*
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
{
    PyObject *result = _PyUnicodeWriter_Finish((_PyUnicodeWriter*)writer);

    /* Finish() must have detached the buffer before the struct is freed. */
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);

    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    return result;
}


/* Release the writer's reference to its buffer, if any, and reset
   writer->buffer to NULL.  The writer struct itself is not freed here. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}

Coverage Report

Created: 2026-03-23 06:45

Line	Count	Source
1		/*
2
3		Unicode implementation based on original code by Fredrik Lundh,
4		modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6		Major speed upgrades to the method implementations at the Reykjavik
7		NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9		Copyright (c) Corporation for National Research Initiatives.
10
11		--------------------------------------------------------------------
12		The original string type implementation is:
13
14		Copyright (c) 1999 by Secret Labs AB
15		Copyright (c) 1999 by Fredrik Lundh
16
17		By obtaining, using, and/or copying this software and/or its
18		associated documentation, you agree that you have read, understood,
19		and will comply with the following terms and conditions:
20
21		Permission to use, copy, modify, and distribute this software and its
22		associated documentation for any purpose and without fee is hereby
23		granted, provided that the above copyright notice appears in all
24		copies, and that both that copyright notice and this permission notice
25		appear in supporting documentation, and that the name of Secret Labs
26		AB or the author not be used in advertising or publicity pertaining to
27		distribution of the software without specific, written prior
28		permission.
29
30		SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31		THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32		FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33		ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34		WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35		ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36		OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37		--------------------------------------------------------------------
38
39		*/
40
41		#include "Python.h"
42		#include "pycore_freelist.h" // _Py_FREELIST_FREE()
43		#include "pycore_long.h" // _PyLong_FormatWriter()
44		#include "pycore_unicodeobject.h" // _PyUnicode_Result()
45
46
47		#ifdef MS_WINDOWS
48		/* On Windows, overallocate by 50% is the best factor */
49		# define OVERALLOCATE_FACTOR 2
50		#else
51		/* On Linux, overallocate by 25% is the best factor */
52	82.3M	# define OVERALLOCATE_FACTOR 4
53		#endif
54
55
56		/* Compilation of templated routines */
57
58		#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
59
60		#include "stringlib/ucs1lib.h"
61		#include "stringlib/find_max_char.h"
62		#include "stringlib/undef.h"
63
64
65		/* Copy an ASCII or latin1 char* string into a Python Unicode string.
66
67		WARNING: The function doesn't copy the terminating null character and
68		doesn't check the maximum character (may write a latin1 character in an
69		ASCII string). */
70		static void
71		unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
72		const char *str, Py_ssize_t len)
73	0	{
74	0	int kind = PyUnicode_KIND(unicode);
75	0	const void *data = PyUnicode_DATA(unicode);
76	0	const char *end = str + len;
77
78	0	assert(index + len <= PyUnicode_GET_LENGTH(unicode));
79	0	switch (kind) {
80	0	case PyUnicode_1BYTE_KIND: {
81		#ifdef Py_DEBUG
82		if (PyUnicode_IS_ASCII(unicode)) {
83		Py_UCS4 maxchar = ucs1lib_find_max_char(
84		(const Py_UCS1*)str,
85		(const Py_UCS1*)str + len);
86		assert(maxchar < 128);
87		}
88		#endif
89	0	memcpy((char *) data + index, str, len);
90	0	break;
91	0	}
92	0	case PyUnicode_2BYTE_KIND: {
93	0	Py_UCS2 start = (Py_UCS2 )data + index;
94	0	Py_UCS2 *ucs2 = start;
95
96	0	for (; str < end; ++ucs2, ++str)
97	0	ucs2 = (Py_UCS2)str;
98
99	0	assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
100	0	break;
101	0	}
102	0	case PyUnicode_4BYTE_KIND: {
103	0	Py_UCS4 start = (Py_UCS4 )data + index;
104	0	Py_UCS4 *ucs4 = start;
105
106	0	for (; str < end; ++ucs4, ++str)
107	0	ucs4 = (Py_UCS4)str;
108
109	0	assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
110	0	break;
111	0	}
112	0	default:
113	0	Py_UNREACHABLE();
114	0	}
115	0	}
116
117
118		static inline void
119		_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
120	64.8M	{
121	64.8M	writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
122	64.8M	writer->data = PyUnicode_DATA(writer->buffer);
123
124	64.8M	if (!writer->readonly) {
125	64.7M	writer->kind = PyUnicode_KIND(writer->buffer);
126	64.7M	writer->size = PyUnicode_GET_LENGTH(writer->buffer);
127	64.7M	}
128	86.2k	else {
129		/* use a value smaller than PyUnicode_1BYTE_KIND() so
130		_PyUnicodeWriter_PrepareKind() will copy the buffer. */
131	86.2k	writer->kind = 0;
132	86.2k	assert(writer->kind <= PyUnicode_1BYTE_KIND);
133
134		/* Copy-on-write mode: set buffer size to 0 so
135		* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
136		* next write. */
137	86.2k	writer->size = 0;
138	86.2k	}
139	64.8M	}
140
141
142		void
143		_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
144	39.5M	{
145	39.5M	memset(writer, 0, sizeof(*writer));
146
147		/* ASCII is the bare minimum */
148	39.5M	writer->min_char = 127;
149
150		/* use a kind value smaller than PyUnicode_1BYTE_KIND so
151		_PyUnicodeWriter_PrepareKind() will copy the buffer. */
152	39.5M	assert(writer->kind == 0);
153	39.5M	assert(writer->kind < PyUnicode_1BYTE_KIND);
154	39.5M	}
155
156
157		PyUnicodeWriter*
158		PyUnicodeWriter_Create(Py_ssize_t length)
159	3.99M	{
160	3.99M	if (length < 0) {
161	0	PyErr_SetString(PyExc_ValueError,
162	0	"length must be positive");
163	0	return NULL;
164	0	}
165
166	3.99M	const size_t size = sizeof(_PyUnicodeWriter);
167	3.99M	PyUnicodeWriter *pub_writer;
168	3.99M	pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
169	3.99M	if (pub_writer == NULL) {
170	2.26M	pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
171	2.26M	if (pub_writer == NULL) {
172	0	return (PyUnicodeWriter *)PyErr_NoMemory();
173	0	}
174	2.26M	}
175	3.99M	_PyUnicodeWriter writer = (_PyUnicodeWriter )pub_writer;
176
177	3.99M	_PyUnicodeWriter_Init(writer);
178	3.99M	if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
179	0	PyUnicodeWriter_Discard(pub_writer);
180	0	return NULL;
181	0	}
182	3.99M	writer->overallocate = 1;
183
184	3.99M	return pub_writer;
185	3.99M	}
186
187
188		void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
189	77.1k	{
190	77.1k	if (writer == NULL) {
191	76.4k	return;
192	76.4k	}
193	660	_PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
194	660	_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
195	660	}
196
197
198		// Initialize _PyUnicodeWriter with initial buffer
199		void
200		_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter writer, PyObject buffer)
201	8.95M	{
202	8.95M	memset(writer, 0, sizeof(*writer));
203	8.95M	writer->buffer = buffer;
204	8.95M	_PyUnicodeWriter_Update(writer);
205	8.95M	writer->min_length = writer->size;
206	8.95M	}
207
208
209		int
210		_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
211		Py_ssize_t length, Py_UCS4 maxchar)
212	55.7M	{
213	55.7M	Py_ssize_t newlen;
214	55.7M	PyObject *newbuffer;
215
216	55.7M	assert(length >= 0);
217	55.7M	assert(maxchar <= _Py_MAX_UNICODE);
218
219		/* ensure that the _PyUnicodeWriter_Prepare macro was used */
220	55.7M	assert((maxchar > writer->maxchar && length >= 0)
221	55.7M	\|\| length > 0);
222
223	55.7M	if (length > PY_SSIZE_T_MAX - writer->pos) {
224	0	PyErr_NoMemory();
225	0	return -1;
226	0	}
227	55.7M	newlen = writer->pos + length;
228
229	55.7M	maxchar = Py_MAX(maxchar, writer->min_char);
230
231	55.7M	if (writer->buffer == NULL) {
232	38.6M	assert(!writer->readonly);
233	38.6M	if (writer->overallocate
234	33.0M	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
235		/* overallocate to limit the number of realloc() */
236	33.0M	newlen += newlen / OVERALLOCATE_FACTOR;
237	33.0M	}
238	38.6M	if (newlen < writer->min_length)
239	33.9M	newlen = writer->min_length;
240
241	38.6M	writer->buffer = PyUnicode_New(newlen, maxchar);
242	38.6M	if (writer->buffer == NULL)
243	0	return -1;
244	38.6M	}
245	17.0M	else if (newlen > writer->size) {
246	8.46M	if (writer->overallocate
247	8.09M	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
248		/* overallocate to limit the number of realloc() */
249	8.09M	newlen += newlen / OVERALLOCATE_FACTOR;
250	8.09M	}
251	8.46M	if (newlen < writer->min_length)
252	1.29k	newlen = writer->min_length;
253
254	8.46M	if (maxchar > writer->maxchar \|\| writer->readonly) {
255		/* resize + widen */
256	2.84M	maxchar = Py_MAX(maxchar, writer->maxchar);
257	2.84M	newbuffer = PyUnicode_New(newlen, maxchar);
258	2.84M	if (newbuffer == NULL)
259	0	return -1;
260	2.84M	_PyUnicode_FastCopyCharacters(newbuffer, 0,
261	2.84M	writer->buffer, 0, writer->pos);
262	2.84M	Py_DECREF(writer->buffer);
263	2.84M	writer->readonly = 0;
264	2.84M	}
265	5.61M	else {
266	5.61M	newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
267	5.61M	if (newbuffer == NULL)
268	0	return -1;
269	5.61M	}
270	8.46M	writer->buffer = newbuffer;
271	8.46M	}
272	8.62M	else if (maxchar > writer->maxchar) {
273	8.62M	assert(!writer->readonly);
274	8.62M	newbuffer = PyUnicode_New(writer->size, maxchar);
275	8.62M	if (newbuffer == NULL)
276	0	return -1;
277	8.62M	_PyUnicode_FastCopyCharacters(newbuffer, 0,
278	8.62M	writer->buffer, 0, writer->pos);
279	8.62M	Py_SETREF(writer->buffer, newbuffer);
280	8.62M	}
281	55.7M	_PyUnicodeWriter_Update(writer);
282	55.7M	return 0;
283
284	55.7M	#undef OVERALLOCATE_FACTOR
285	55.7M	}
286
287		int
288		_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
289		int kind)
290	198k	{
291	198k	Py_UCS4 maxchar;
292
293		/* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
294	198k	assert(writer->kind < kind);
295
296	198k	switch (kind)
297	198k	{
298	0	case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
299	198k	case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
300	0	case PyUnicode_4BYTE_KIND: maxchar = _Py_MAX_UNICODE; break;
301	0	default:
302	0	Py_UNREACHABLE();
303	198k	}
304
305	198k	return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
306	198k	}
307
308
309		int
310		_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
311	83.4M	{
312	83.4M	return _PyUnicodeWriter_WriteCharInline(writer, ch);
313	83.4M	}
314
315
316		int
317		PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
318	61.5M	{
319	61.5M	if (ch > _Py_MAX_UNICODE) {
320	0	PyErr_SetString(PyExc_ValueError,
321	0	"character must be in range(0x110000)");
322	0	return -1;
323	0	}
324
325	61.5M	return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
326	61.5M	}
327
328
329		int
330		_PyUnicodeWriter_WriteStr(_PyUnicodeWriter writer, PyObject str)
331	59.6M	{
332	59.6M	assert(PyUnicode_Check(str));
333
334	59.6M	Py_UCS4 maxchar;
335	59.6M	Py_ssize_t len;
336
337	59.6M	len = PyUnicode_GET_LENGTH(str);
338	59.6M	if (len == 0)
339	4.14M	return 0;
340	55.4M	maxchar = PyUnicode_MAX_CHAR_VALUE(str);
341	55.4M	if (maxchar > writer->maxchar \|\| len > writer->size - writer->pos) {
342	17.4M	if (writer->buffer == NULL && !writer->overallocate) {
343	5.53k	assert(_PyUnicode_CheckConsistency(str, 1));
344	5.53k	writer->readonly = 1;
345	5.53k	writer->buffer = Py_NewRef(str);
346	5.53k	_PyUnicodeWriter_Update(writer);
347	5.53k	writer->pos += len;
348	5.53k	return 0;
349	5.53k	}
350	17.4M	if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
351	0	return -1;
352	17.4M	}
353	55.4M	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
354	55.4M	str, 0, len);
355	55.4M	writer->pos += len;
356	55.4M	return 0;
357	55.4M	}
358
359
360		int
361		PyUnicodeWriter_WriteStr(PyUnicodeWriter writer, PyObject obj)
362	2.88M	{
363	2.88M	PyTypeObject *type = Py_TYPE(obj);
364	2.88M	if (type == &PyUnicode_Type) {
365	2.88M	return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
366	2.88M	}
367
368	0	if (type == &PyLong_Type) {
369	0	return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
370	0	}
371
372	0	PyObject *str = PyObject_Str(obj);
373	0	if (str == NULL) {
374	0	return -1;
375	0	}
376
377	0	int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
378	0	Py_DECREF(str);
379	0	return res;
380	0	}
381
382
383		int
384		PyUnicodeWriter_WriteRepr(PyUnicodeWriter writer, PyObject obj)
385	7.67M	{
386	7.67M	if (obj == NULL) {
387	0	return _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter*)writer, "<NULL>", 6);
388	0	}
389
390	7.67M	if (Py_TYPE(obj) == &PyLong_Type) {
391	258k	return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
392	258k	}
393
394	7.41M	PyObject *repr = PyObject_Repr(obj);
395	7.41M	if (repr == NULL) {
396	0	return -1;
397	0	}
398
399	7.41M	int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
400	7.41M	Py_DECREF(repr);
401	7.41M	return res;
402	7.41M	}
403
404
405		int
406		_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter writer, PyObject str,
407		Py_ssize_t start, Py_ssize_t end)
408	45.6M	{
409	45.6M	assert(0 <= start);
410	45.6M	assert(end <= PyUnicode_GET_LENGTH(str));
411	45.6M	assert(start <= end);
412
413	45.6M	if (start == 0 && end == PyUnicode_GET_LENGTH(str))
414	90	return _PyUnicodeWriter_WriteStr(writer, str);
415
416	45.6M	Py_ssize_t len = end - start;
417	45.6M	if (len == 0) {
418	192	return 0;
419	192	}
420
421	45.6M	Py_UCS4 maxchar;
422	45.6M	if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
423	13.3M	maxchar = _PyUnicode_FindMaxChar(str, start, end);
424	13.3M	}
425	32.2M	else {
426	32.2M	maxchar = writer->maxchar;
427	32.2M	}
428	45.6M	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
429	0	return -1;
430	0	}
431
432	45.6M	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
433	45.6M	str, start, len);
434	45.6M	writer->pos += len;
435	45.6M	return 0;
436	45.6M	}
437
438
439		int
440		PyUnicodeWriter_WriteSubstring(PyUnicodeWriter writer, PyObject str,
441		Py_ssize_t start, Py_ssize_t end)
442	412k	{
443	412k	if (!PyUnicode_Check(str)) {
444	0	PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
445	0	return -1;
446	0	}
447	412k	if (start < 0 \|\| start > end) {
448	0	PyErr_Format(PyExc_ValueError, "invalid start argument");
449	0	return -1;
450	0	}
451	412k	if (end > PyUnicode_GET_LENGTH(str)) {
452	0	PyErr_Format(PyExc_ValueError, "invalid end argument");
453	0	return -1;
454	0	}
455
456	412k	return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
457	412k	start, end);
458	412k	}
459
460
461		int
462		_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
463		const char *ascii, Py_ssize_t len)
464	40.4M	{
465	40.4M	if (len == -1)
466	0	len = strlen(ascii);
467
468	40.4M	if (len == 0) {
469	0	return 0;
470	0	}
471
472	40.4M	assert(ucs1lib_find_max_char((const Py_UCS1)ascii, (const Py_UCS1)ascii + len) < 128);
473
474	40.4M	if (writer->buffer == NULL && !writer->overallocate) {
475	80.6k	PyObject *str;
476
477	80.6k	str = _PyUnicode_FromASCII(ascii, len);
478	80.6k	if (str == NULL)
479	0	return -1;
480
481	80.6k	writer->readonly = 1;
482	80.6k	writer->buffer = str;
483	80.6k	_PyUnicodeWriter_Update(writer);
484	80.6k	writer->pos += len;
485	80.6k	return 0;
486	80.6k	}
487
488	40.3M	if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
489	0	return -1;
490
491	40.3M	switch (writer->kind)
492	40.3M	{
493	40.3M	case PyUnicode_1BYTE_KIND:
494	40.3M	{
495	40.3M	const Py_UCS1 str = (const Py_UCS1 )ascii;
496	40.3M	Py_UCS1 *data = writer->data;
497
498	40.3M	memcpy(data + writer->pos, str, len);
499	40.3M	break;
500	0	}
501	12.0k	case PyUnicode_2BYTE_KIND:
502	12.0k	{
503	12.0k	_PyUnicode_CONVERT_BYTES(
504	12.0k	Py_UCS1, Py_UCS2,
505	12.0k	ascii, ascii + len,
506	12.0k	(Py_UCS2 *)writer->data + writer->pos);
507	12.0k	break;
508	0	}
509	3.43k	case PyUnicode_4BYTE_KIND:
510	3.43k	{
511	3.43k	_PyUnicode_CONVERT_BYTES(
512	3.43k	Py_UCS1, Py_UCS4,
513	3.43k	ascii, ascii + len,
514	3.43k	(Py_UCS4 *)writer->data + writer->pos);
515	3.43k	break;
516	0	}
517	0	default:
518	0	Py_UNREACHABLE();
519	40.3M	}
520
521	40.3M	writer->pos += len;
522	40.3M	return 0;
523	40.3M	}
524
525
526		int
527		PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
528		const char *str,
529		Py_ssize_t size)
530	490k	{
531	490k	assert(writer != NULL);
532	490k	_Py_AssertHoldsTstate();
533
534	490k	_PyUnicodeWriter priv_writer = (_PyUnicodeWriter)writer;
535	490k	return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
536	490k	}
537
538
539		int
540		PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
541		const char *str,
542		Py_ssize_t size)
543	0	{
544	0	if (size < 0) {
545	0	size = strlen(str);
546	0	}
547
548	0	_PyUnicodeWriter _writer = (_PyUnicodeWriter)writer;
549	0	Py_ssize_t old_pos = _writer->pos;
550	0	int res = _PyUnicode_DecodeUTF8Writer(_writer, str, size,
551	0	_Py_ERROR_STRICT, NULL, NULL);
552	0	if (res < 0) {
553	0	_writer->pos = old_pos;
554	0	}
555	0	return res;
556	0	}
557
558
559		int
560		PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
561		const char *string,
562		Py_ssize_t length,
563		const char *errors,
564		Py_ssize_t *consumed)
565	0	{
566	0	if (length < 0) {
567	0	length = strlen(string);
568	0	}
569
570	0	_PyUnicodeWriter _writer = (_PyUnicodeWriter)writer;
571	0	Py_ssize_t old_pos = _writer->pos;
572	0	int res = _PyUnicode_DecodeUTF8Writer(_writer, string, length,
573	0	_Py_ERROR_UNKNOWN, errors,
574	0	consumed);
575	0	if (res < 0) {
576	0	_writer->pos = old_pos;
577	0	if (consumed) {
578	0	*consumed = 0;
579	0	}
580	0	}
581	0	return res;
582	0	}
583
584
585		int
586		_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
587		const char *str, Py_ssize_t len)
588	0	{
589	0	Py_UCS4 maxchar;
590
591	0	maxchar = ucs1lib_find_max_char((const Py_UCS1)str, (const Py_UCS1)str + len);
592	0	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
593	0	return -1;
594	0	unicode_write_cstr(writer->buffer, writer->pos, str, len);
595	0	writer->pos += len;
596	0	return 0;
597	0	}
598
599
600		PyObject *
601		_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
602	45.6M	{
603	45.6M	PyObject *str;
604
605	45.6M	if (writer->pos == 0) {
606	829	Py_CLEAR(writer->buffer);
607	829	return _PyUnicode_GetEmpty();
608	829	}
609
610	45.6M	str = writer->buffer;
611	45.6M	writer->buffer = NULL;
612
613	45.6M	if (writer->readonly) {
614	84.9k	assert(PyUnicode_GET_LENGTH(str) == writer->pos);
615	84.9k	return str;
616	84.9k	}
617
618	45.5M	if (PyUnicode_GET_LENGTH(str) != writer->pos) {
619	43.8M	PyObject *str2;
620	43.8M	str2 = _PyUnicode_ResizeCompact(str, writer->pos);
621	43.8M	if (str2 == NULL) {
622	0	Py_DECREF(str);
623	0	return NULL;
624	0	}
625	43.8M	str = str2;
626	43.8M	}
627
628	45.5M	assert(_PyUnicode_CheckConsistency(str, 1));
629	45.5M	return _PyUnicode_Result(str);
630	45.5M	}
631
632
633		PyObject*
634		PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
635	3.99M	{
636	3.99M	PyObject str = _PyUnicodeWriter_Finish((_PyUnicodeWriter)writer);
637	3.99M	assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
638	3.99M	_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
639	3.99M	return str;
640	3.99M	}
641
642
643		void
644		_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
645	2.84M	{
646		Py_CLEAR(writer->buffer);
647	2.84M	}