/src/cpython3/Objects/unicode_writer.c

Source
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#include "Python.h"
#include "pycore_freelist.h"      // _Py_FREELIST_FREE()
#include "pycore_long.h"          // _PyLong_FormatWriter()
#include "pycore_unicodeobject.h" // _PyUnicode_Result()


#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% works best: the buffer grows by
      newlen / OVERALLOCATE_FACTOR, i.e. newlen / 2. */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% works best: the buffer grows by
      newlen / OVERALLOCATE_FACTOR, i.e. newlen / 4. */
#  define OVERALLOCATE_FACTOR 4
#endif


/* Compilation of templated routines */

#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()

#include "stringlib/ucs1lib.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"


/* Copy an ASCII or latin1 char* string into a Python Unicode string.

   Writes the `len` bytes starting at `str` into `unicode`, beginning at
   character position `index`.  Each source byte is read as an unsigned
   value (0..255) so that latin1 bytes >= 0x80 are widened to the matching
   code point even on platforms where plain `char` is signed.

   WARNING: The function doesn't copy the terminating null character and
   doesn't check the maximum character (may write a latin1 character in an
   ASCII string). */
static void
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
                   const char *str, Py_ssize_t len)
{
    int kind = PyUnicode_KIND(unicode);
    /* The character buffer is written to below, so it is not const. */
    void *data = PyUnicode_DATA(unicode);
    /* Read the input as unsigned bytes: converting a negative `char` to
       Py_UCS2/Py_UCS4 would sign-extend (e.g. 0xE9 -> 0xFFE9). */
    const unsigned char *src = (const unsigned char *)str;
    const unsigned char *end = src + len;

    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
    switch (kind) {
    case PyUnicode_1BYTE_KIND: {
#ifdef Py_DEBUG
        if (PyUnicode_IS_ASCII(unicode)) {
            Py_UCS4 maxchar = ucs1lib_find_max_char(
                (const Py_UCS1*)str,
                (const Py_UCS1*)str + len);
            assert(maxchar < 128);
        }
#endif
        /* Same element width: a raw byte copy is enough. */
        memcpy((char *) data + index, str, len);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        Py_UCS2 *start = (Py_UCS2 *)data + index;
        Py_UCS2 *ucs2 = start;

        for (; src < end; ++ucs2, ++src) {
            *ucs2 = (Py_UCS2)*src;
        }

        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
        break;
    }
    case PyUnicode_4BYTE_KIND: {
        Py_UCS4 *start = (Py_UCS4 *)data + index;
        Py_UCS4 *ucs4 = start;

        for (; src < end; ++ucs4, ++src) {
            *ucs4 = (Py_UCS4)*src;
        }

        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
        break;
    }
    default:
        Py_UNREACHABLE();
    }
}


/* Refresh the fields the writer caches about its current buffer
   (maxchar, data, kind, size).  Must be called with a non-NULL buffer. */
static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
    PyObject *buf = writer->buffer;

    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(buf);
    writer->data = PyUnicode_DATA(buf);

    if (writer->readonly) {
        /* A kind below PyUnicode_1BYTE_KIND makes
           _PyUnicodeWriter_PrepareKind() copy the buffer. */
        writer->kind = 0;
        assert(writer->kind <= PyUnicode_1BYTE_KIND);

        /* Copy-on-write mode: a reported size of 0 makes
         * _PyUnicodeWriter_Prepare() copy (and enlarge) the buffer on
         * the next write. */
        writer->size = 0;
        return;
    }

    /* Writable buffer: expose its real kind and capacity. */
    writer->kind = PyUnicode_KIND(buf);
    writer->size = PyUnicode_GET_LENGTH(buf);
}


/* Reset `writer` to its initial state: every field zeroed, except
   min_char which is set to 127. */
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
    memset(writer, 0, sizeof(_PyUnicodeWriter));

    /* The zeroed kind must stay below PyUnicode_1BYTE_KIND so that
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
    assert(writer->kind == 0);
    assert(writer->kind < PyUnicode_1BYTE_KIND);

    /* ASCII is the bare minimum */
    writer->min_char = 127;
}


/* Create a writer whose buffer is prepared for `length` characters.

   Returns the new writer, or NULL on failure: ValueError is set when
   `length` is negative, and PyErr_NoMemory() is called when the writer
   structure cannot be allocated. */
PyUnicodeWriter*
PyUnicodeWriter_Create(Py_ssize_t length)
{
    if (length < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "length must be positive");
        return NULL;
    }

    /* Try the freelist first; fall back to a fresh allocation. */
    PyUnicodeWriter *pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
    if (pub_writer == NULL) {
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(sizeof(_PyUnicodeWriter));
        if (pub_writer == NULL) {
            return (PyUnicodeWriter *)PyErr_NoMemory();
        }
    }

    _PyUnicodeWriter *priv = (_PyUnicodeWriter *)pub_writer;
    _PyUnicodeWriter_Init(priv);

    /* Prepare the initial buffer before overallocation is switched on. */
    if (_PyUnicodeWriter_Prepare(priv, length, 127) < 0) {
        PyUnicodeWriter_Discard(pub_writer);
        return NULL;
    }
    priv->overallocate = 1;

    return pub_writer;
}


/* Drop a writer without producing a string.  Passing NULL is allowed and
   does nothing. */
void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        /* Release the buffer, then hand the structure to the freelist
           (with PyMem_Free as the free function). */
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}


// Initialize _PyUnicodeWriter so that it starts out using `buffer`.
// All other fields are zeroed first; min_length is then set to the size
// reported for that buffer.
void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{
    memset(writer, 0, sizeof(_PyUnicodeWriter));

    writer->buffer = buffer;
    // Fill in the cached maxchar/data/kind/size from the buffer.
    _PyUnicodeWriter_Update(writer);

    writer->min_length = writer->size;
}


/* Make room in the writer for `length` more characters whose largest code
   point is `maxchar`.

   Depending on the current state this allocates the first buffer, grows
   (and possibly widens) the existing one, or only widens it.  The cached
   writer fields are refreshed before returning.

   Returns 0 on success, -1 on failure. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(length >= 0);
    assert(maxchar <= _Py_MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Guard against overflow of pos + length. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never go below the writer's configured minimum character range. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: no buffer yet -- allocate the first one. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the existing buffer is too small. */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen: build a new buffer and copy the characters
               written so far; a read-only buffer is also replaced this way,
               after which the writer is no longer read-only. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character width: resize the existing buffer.  On failure
               writer->buffer is not reassigned. */
            newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: enough room, but the characters need a wider buffer:
           allocate one of the same size and copy what was written. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Re-read maxchar/data/kind/size from the (possibly new) buffer. */
    _PyUnicodeWriter_Update(writer);
    return 0;

/* The macro is not defined past this point in the file. */
#undef OVERALLOCATE_FACTOR
}

/* Widen the writer's buffer so it can hold characters of the given `kind`.

   Maps `kind` to the largest code point that kind can store and delegates
   to _PyUnicodeWriter_PrepareInternal() with a length of 0.  Returns that
   function's result. */
int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind)
{
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
    assert(writer->kind < kind);

    Py_UCS4 widest;
    if (kind == PyUnicode_1BYTE_KIND) {
        widest = 0xff;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        widest = 0xffff;
    }
    else if (kind == PyUnicode_4BYTE_KIND) {
        widest = _Py_MAX_UNICODE;
    }
    else {
        Py_UNREACHABLE();
    }

    return _PyUnicodeWriter_PrepareInternal(writer, 0, widest);
}


/* Out-of-line entry point for writing the character `ch`: forwards to
   _PyUnicodeWriter_WriteCharInline() and returns its result unchanged. */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}


/* Public variant of _PyUnicodeWriter_WriteChar() that validates `ch`.

   Returns -1 with ValueError set when `ch` is above _Py_MAX_UNICODE;
   otherwise returns the result of _PyUnicodeWriter_WriteChar(). */
int
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
{
    if (ch <= _Py_MAX_UNICODE) {
        return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
    }

    PyErr_SetString(PyExc_ValueError,
                    "character must be in range(0x110000)");
    return -1;
}


/* Append the whole Unicode string `str` to the writer.

   Returns 0 on success (including when `str` is empty) and -1 if the
   buffer could not be prepared. */
int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{
    assert(PyUnicode_Check(str));

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    if (nchars == 0) {
        return 0;
    }

    const Py_UCS4 str_maxchar = PyUnicode_MAX_CHAR_VALUE(str);
    const int fits = (str_maxchar <= writer->maxchar
                      && nchars <= writer->size - writer->pos);

    if (!fits) {
        if (writer->buffer == NULL && !writer->overallocate) {
            /* First write without overallocation: use `str` itself as a
               read-only buffer (holding a new reference) instead of
               copying its characters. */
            assert(_PyUnicode_CheckConsistency(str, 1));
            writer->readonly = 1;
            writer->buffer = Py_NewRef(str);
            _PyUnicodeWriter_Update(writer);
            writer->pos += nchars;
            return 0;
        }
        if (_PyUnicodeWriter_PrepareInternal(writer, nchars, str_maxchar) == -1) {
            return -1;
        }
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, 0, nchars);
    writer->pos += nchars;
    return 0;
}


/* Write the string form of `obj` to the writer.

   Exact `str` objects are written directly and exact `int` objects are
   formatted in base 10 through _PyLong_FormatWriter(); anything else goes
   through PyObject_Str().  Returns -1 if PyObject_Str() fails, otherwise
   the result of the underlying write. */
int
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    PyTypeObject *tp = Py_TYPE(obj);

    if (tp == &PyUnicode_Type) {
        return _PyUnicodeWriter_WriteStr(priv, obj);
    }
    if (tp == &PyLong_Type) {
        return _PyLong_FormatWriter(priv, obj, 10, 0);
    }

    PyObject *text = PyObject_Str(obj);
    if (text == NULL) {
        return -1;
    }

    int status = _PyUnicodeWriter_WriteStr(priv, text);
    /* The temporary string is no longer needed once it has been written. */
    Py_DECREF(text);
    return status;
}


/* Write the repr of `obj` to the writer.

   Exact `int` objects are formatted in base 10 through
   _PyLong_FormatWriter(); anything else goes through PyObject_Repr().
   Returns -1 if PyObject_Repr() fails, otherwise the result of the
   underlying write. */
int
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;

    if (Py_TYPE(obj) == &PyLong_Type) {
        return _PyLong_FormatWriter(priv, obj, 10, 0);
    }

    PyObject *text = PyObject_Repr(obj);
    if (text == NULL) {
        return -1;
    }

    int status = _PyUnicodeWriter_WriteStr(priv, text);
    /* The temporary repr string is released after it has been written. */
    Py_DECREF(text);
    return status;
}


/* Append the characters str[start:end] to the writer.

   The caller must guarantee 0 <= start <= end <= len(str); this is only
   checked by assertions.  Returns 0 on success and -1 if the buffer could
   not be prepared. */
int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
                                Py_ssize_t start, Py_ssize_t end)
{
    assert(0 <= start);
    assert(end <= PyUnicode_GET_LENGTH(str));
    assert(start <= end);

    /* The slice covers the whole string: use the full-string writer. */
    if (start == 0 && end == PyUnicode_GET_LENGTH(str)) {
        return _PyUnicodeWriter_WriteStr(writer, str);
    }

    const Py_ssize_t count = end - start;
    if (count == 0) {
        return 0;
    }

    /* Only scan the slice for its real maximum character when the string
       as a whole might exceed what the writer currently holds. */
    Py_UCS4 needed = (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
                         ? _PyUnicode_FindMaxChar(str, start, end)
                         : writer->maxchar;

    if (_PyUnicodeWriter_Prepare(writer, count, needed) < 0) {
        return -1;
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, start, count);
    writer->pos += count;
    return 0;
}


/* Public, argument-checking variant of _PyUnicodeWriter_WriteSubstring().

   Returns -1 with TypeError set if `str` is not a str, and -1 with
   ValueError set if `start` is negative or greater than `end`, or if `end`
   is past the end of `str`.  Otherwise returns the result of
   _PyUnicodeWriter_WriteSubstring(). */
int
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
                               Py_ssize_t start, Py_ssize_t end)
{
    if (!PyUnicode_Check(str)) {
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
        return -1;
    }

    const int start_ok = (start >= 0 && start <= end);
    if (!start_ok) {
        PyErr_Format(PyExc_ValueError, "invalid start argument");
        return -1;
    }

    const Py_ssize_t str_len = PyUnicode_GET_LENGTH(str);
    if (end > str_len) {
        PyErr_Format(PyExc_ValueError, "invalid end argument");
        return -1;
    }

    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    return _PyUnicodeWriter_WriteSubstring(priv, str, start, end);
}


/* Append `len` ASCII bytes from `ascii` to the writer.

   If `len` is -1, the length is computed with strlen().  Every byte must be
   below 128; this is only verified by an assertion.

   Returns 0 on success and -1 on failure (the string object could not be
   created or the buffer could not be prepared). */
int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
                                  const char *ascii, Py_ssize_t len)
{
    if (len == -1)
        len = (Py_ssize_t)strlen(ascii);

    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);

    /* Nothing to append: return early, as _PyUnicodeWriter_WriteStr() and
       _PyUnicodeWriter_WriteSubstring() do.  This also keeps the kind
       switch below from running on a writer that has no buffer yet, whose
       kind is still 0 (NOTE(review): presumably _PyUnicodeWriter_Prepare()
       does not allocate for a zero-length request -- confirm). */
    if (len == 0) {
        return 0;
    }

    if (writer->buffer == NULL && !writer->overallocate) {
        /* First write without overallocation: build the string object
           directly and use it as a read-only buffer. */
        PyObject *str;

        str = _PyUnicode_FromASCII(ascii, len);
        if (str == NULL)
            return -1;

        writer->readonly = 1;
        writer->buffer = str;
        _PyUnicodeWriter_Update(writer);
        writer->pos += len;
        return 0;
    }

    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
        return -1;

    /* Copy the bytes, widening them to the buffer's character size. */
    switch (writer->kind)
    {
    case PyUnicode_1BYTE_KIND:
    {
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
        Py_UCS1 *data = writer->data;

        memcpy(data + writer->pos, str, len);
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS2,
            ascii, ascii + len,
            (Py_UCS2 *)writer->data + writer->pos);
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS4,
            ascii, ascii + len,
            (Py_UCS4 *)writer->data + writer->pos);
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    writer->pos += len;
    return 0;
}


/* Public wrapper around _PyUnicodeWriter_WriteASCIIString().

   `writer` must not be NULL (checked by assertion only).  Returns the
   result of the underlying call. */
int
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
                           const char *str,
                           Py_ssize_t size)
{
    assert(writer != NULL);
    _Py_AssertHoldsTstate();

    return _PyUnicodeWriter_WriteASCIIString((_PyUnicodeWriter*)writer,
                                             str, size);
}


/* Decode `size` bytes of UTF-8 from `str` in strict mode and append the
   result to the writer.

   A negative `size` means "use strlen(str)".  If decoding fails, the write
   position is restored to what it was on entry.  Returns the result of
   _PyUnicode_DecodeUTF8Writer(). */
int
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
                          const char *str,
                          Py_ssize_t size)
{
    const Py_ssize_t nbytes = (size < 0) ? (Py_ssize_t)strlen(str) : size;

    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    const Py_ssize_t saved_pos = priv->pos;

    int status = _PyUnicode_DecodeUTF8Writer(priv, str, nbytes,
                                             _Py_ERROR_STRICT, NULL, NULL);
    if (status < 0) {
        /* Roll back the write position after a failed decode. */
        priv->pos = saved_pos;
    }
    return status;
}


/* Decode `length` bytes of UTF-8 from `string` using the error handler
   named by `errors`, and append the result to the writer.

   A negative `length` means "use strlen(string)".  `consumed` is passed
   through to _PyUnicode_DecodeUTF8Writer() and may be NULL.  If decoding
   fails, the write position is restored to what it was on entry and
   *consumed (when given) is set to 0.  Returns the decoder's result. */
int
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                   const char *string,
                                   Py_ssize_t length,
                                   const char *errors,
                                   Py_ssize_t *consumed)
{
    const Py_ssize_t nbytes =
        (length < 0) ? (Py_ssize_t)strlen(string) : length;

    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    const Py_ssize_t saved_pos = priv->pos;

    int status = _PyUnicode_DecodeUTF8Writer(priv, string, nbytes,
                                             _Py_ERROR_UNKNOWN, errors,
                                             consumed);
    if (status >= 0) {
        return status;
    }

    /* Failure: roll back the write position and report nothing consumed. */
    priv->pos = saved_pos;
    if (consumed != NULL) {
        *consumed = 0;
    }
    return status;
}


/* Append `len` latin1 bytes from `str` to the writer.

   The bytes are scanned for their largest value so the buffer is prepared
   for the right character range, then copied with unicode_write_cstr().

   Returns 0 on success and -1 if the buffer could not be prepared. */
int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                   const char *str, Py_ssize_t len)
{
    Py_UCS4 maxchar;

    /* Nothing to append: return early, as _PyUnicodeWriter_WriteStr() and
       _PyUnicodeWriter_WriteSubstring() do.  This also avoids handing a
       NULL buffer to unicode_write_cstr() on a writer that has not
       allocated one yet (NOTE(review): presumably
       _PyUnicodeWriter_Prepare() does not allocate for a zero-length
       request -- confirm). */
    if (len == 0) {
        return 0;
    }

    maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
        return -1;
    unicode_write_cstr(writer->buffer, writer->pos, str, len);
    writer->pos += len;
    return 0;
}


/* Produce the final string from the writer and detach its buffer.

   Returns the empty string if nothing was written, the buffer itself when
   it is a read-only string, or the buffer resized to the number of
   characters written and passed through _PyUnicode_Result().  Returns NULL
   if the final resize fails.  In every case writer->buffer is NULL
   afterwards. */
PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{
    if (writer->pos == 0) {
        Py_CLEAR(writer->buffer);
        return _PyUnicode_GetEmpty();
    }

    /* Move the buffer reference out of the writer. */
    PyObject *result = writer->buffer;
    writer->buffer = NULL;

    if (writer->readonly) {
        assert(PyUnicode_GET_LENGTH(result) == writer->pos);
        return result;
    }

    if (PyUnicode_GET_LENGTH(result) != writer->pos) {
        /* Resize the buffer to exactly the number of characters written. */
        PyObject *resized = _PyUnicode_ResizeCompact(result, writer->pos);
        if (resized == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        result = resized;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return _PyUnicode_Result(result);
}


/* Finish the public writer: build the final string with
   _PyUnicodeWriter_Finish(), then give the writer structure to the
   freelist (with PyMem_Free as the free function).  Returns whatever
   _PyUnicodeWriter_Finish() returned. */
PyObject*
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;

    PyObject *result = _PyUnicodeWriter_Finish(priv);
    assert(((_PyUnicodeWriter*)writer)->buffer == NULL);

    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    return result;
}


/* Release the writer's reference to its buffer, if any, and reset the
   buffer pointer to NULL.  The writer structure itself is not freed here. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}

Coverage Report

Created: 2025-12-14 07:07

Line	Count	Source
1		/*
2
3		Unicode implementation based on original code by Fredrik Lundh,
4		modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6		Major speed upgrades to the method implementations at the Reykjavik
7		NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9		Copyright (c) Corporation for National Research Initiatives.
10
11		--------------------------------------------------------------------
12		The original string type implementation is:
13
14		Copyright (c) 1999 by Secret Labs AB
15		Copyright (c) 1999 by Fredrik Lundh
16
17		By obtaining, using, and/or copying this software and/or its
18		associated documentation, you agree that you have read, understood,
19		and will comply with the following terms and conditions:
20
21		Permission to use, copy, modify, and distribute this software and its
22		associated documentation for any purpose and without fee is hereby
23		granted, provided that the above copyright notice appears in all
24		copies, and that both that copyright notice and this permission notice
25		appear in supporting documentation, and that the name of Secret Labs
26		AB or the author not be used in advertising or publicity pertaining to
27		distribution of the software without specific, written prior
28		permission.
29
30		SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31		THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32		FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33		ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34		WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35		ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36		OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37		--------------------------------------------------------------------
38
39		*/
40
41		#include "Python.h"
42		#include "pycore_freelist.h" // _Py_FREELIST_FREE()
43		#include "pycore_long.h" // _PyLong_FormatWriter()
44		#include "pycore_unicodeobject.h" // _PyUnicode_Result()
45
46
47		#ifdef MS_WINDOWS
48		/* On Windows, overallocate by 50% is the best factor */
49		# define OVERALLOCATE_FACTOR 2
50		#else
51		/* On Linux, overallocate by 25% is the best factor */
52	4.17M	# define OVERALLOCATE_FACTOR 4
53		#endif
54
55
56		/* Compilation of templated routines */
57
58		#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
59
60		#include "stringlib/ucs1lib.h"
61		#include "stringlib/find_max_char.h"
62		#include "stringlib/undef.h"
63
64
65		/* Copy an ASCII or latin1 char* string into a Python Unicode string.
66
67		WARNING: The function doesn't copy the terminating null character and
68		doesn't check the maximum character (may write a latin1 character in an
69		ASCII string). */
70		static void
71		unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
72		const char *str, Py_ssize_t len)
73	0	{
74	0	int kind = PyUnicode_KIND(unicode);
75	0	const void *data = PyUnicode_DATA(unicode);
76	0	const char *end = str + len;
77
78	0	assert(index + len <= PyUnicode_GET_LENGTH(unicode));
79	0	switch (kind) {
80	0	case PyUnicode_1BYTE_KIND: {
81		#ifdef Py_DEBUG
82		if (PyUnicode_IS_ASCII(unicode)) {
83		Py_UCS4 maxchar = ucs1lib_find_max_char(
84		(const Py_UCS1*)str,
85		(const Py_UCS1*)str + len);
86		assert(maxchar < 128);
87		}
88		#endif
89	0	memcpy((char *) data + index, str, len);
90	0	break;
91	0	}
92	0	case PyUnicode_2BYTE_KIND: {
93	0	Py_UCS2 start = (Py_UCS2 )data + index;
94	0	Py_UCS2 *ucs2 = start;
95
96	0	for (; str < end; ++ucs2, ++str)
97	0	ucs2 = (Py_UCS2)str;
98
99	0	assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
100	0	break;
101	0	}
102	0	case PyUnicode_4BYTE_KIND: {
103	0	Py_UCS4 start = (Py_UCS4 )data + index;
104	0	Py_UCS4 *ucs4 = start;
105
106	0	for (; str < end; ++ucs4, ++str)
107	0	ucs4 = (Py_UCS4)str;
108
109	0	assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
110	0	break;
111	0	}
112	0	default:
113	0	Py_UNREACHABLE();
114	0	}
115	0	}
116
117
118		static inline void
119		_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
120	17.1M	{
121	17.1M	writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
122	17.1M	writer->data = PyUnicode_DATA(writer->buffer);
123
124	17.1M	if (!writer->readonly) {
125	17.0M	writer->kind = PyUnicode_KIND(writer->buffer);
126	17.0M	writer->size = PyUnicode_GET_LENGTH(writer->buffer);
127	17.0M	}
128	38.0k	else {
129		/* use a value smaller than PyUnicode_1BYTE_KIND() so
130		_PyUnicodeWriter_PrepareKind() will copy the buffer. */
131	38.0k	writer->kind = 0;
132	38.0k	assert(writer->kind <= PyUnicode_1BYTE_KIND);
133
134		/* Copy-on-write mode: set buffer size to 0 so
135		* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
136		* next write. */
137	38.0k	writer->size = 0;
138	38.0k	}
139	17.1M	}
140
141
142		void
143		_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
144	2.11M	{
145	2.11M	memset(writer, 0, sizeof(*writer));
146
147		/* ASCII is the bare minimum */
148	2.11M	writer->min_char = 127;
149
150		/* use a kind value smaller than PyUnicode_1BYTE_KIND so
151		_PyUnicodeWriter_PrepareKind() will copy the buffer. */
152	2.11M	assert(writer->kind == 0);
153	2.11M	assert(writer->kind < PyUnicode_1BYTE_KIND);
154	2.11M	}
155
156
157		PyUnicodeWriter*
158		PyUnicodeWriter_Create(Py_ssize_t length)
159	84.4k	{
160	84.4k	if (length < 0) {
161	0	PyErr_SetString(PyExc_ValueError,
162	0	"length must be positive");
163	0	return NULL;
164	0	}
165
166	84.4k	const size_t size = sizeof(_PyUnicodeWriter);
167	84.4k	PyUnicodeWriter *pub_writer;
168	84.4k	pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
169	84.4k	if (pub_writer == NULL) {
170	999	pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
171	999	if (pub_writer == NULL) {
172	0	return (PyUnicodeWriter *)PyErr_NoMemory();
173	0	}
174	999	}
175	84.4k	_PyUnicodeWriter writer = (_PyUnicodeWriter )pub_writer;
176
177	84.4k	_PyUnicodeWriter_Init(writer);
178	84.4k	if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
179	0	PyUnicodeWriter_Discard(pub_writer);
180	0	return NULL;
181	0	}
182	84.4k	writer->overallocate = 1;
183
184	84.4k	return pub_writer;
185	84.4k	}
186
187
188		void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
189	38	{
190	38	if (writer == NULL) {
191	16	return;
192	16	}
193	22	_PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
194	22	_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
195	22	}
196
197
198		// Initialize _PyUnicodeWriter with initial buffer
199		void
200		_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter writer, PyObject buffer)
201	7.02M	{
202	7.02M	memset(writer, 0, sizeof(*writer));
203	7.02M	writer->buffer = buffer;
204	7.02M	_PyUnicodeWriter_Update(writer);
205	7.02M	writer->min_length = writer->size;
206	7.02M	}
207
208
209		int
210		_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
211		Py_ssize_t length, Py_UCS4 maxchar)
212	10.0M	{
213	10.0M	Py_ssize_t newlen;
214	10.0M	PyObject *newbuffer;
215
216	10.0M	assert(length >= 0);
217	10.0M	assert(maxchar <= _Py_MAX_UNICODE);
218
219		/* ensure that the _PyUnicodeWriter_Prepare macro was used */
220	10.0M	assert((maxchar > writer->maxchar && length >= 0)
221	10.0M	\|\| length > 0);
222
223	10.0M	if (length > PY_SSIZE_T_MAX - writer->pos) {
224	0	PyErr_NoMemory();
225	0	return -1;
226	0	}
227	10.0M	newlen = writer->pos + length;
228
229	10.0M	maxchar = Py_MAX(maxchar, writer->min_char);
230
231	10.0M	if (writer->buffer == NULL) {
232	2.07M	assert(!writer->readonly);
233	2.07M	if (writer->overallocate
234	2.01M	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
235		/* overallocate to limit the number of realloc() */
236	2.01M	newlen += newlen / OVERALLOCATE_FACTOR;
237	2.01M	}
238	2.07M	if (newlen < writer->min_length)
239	1.96M	newlen = writer->min_length;
240
241	2.07M	writer->buffer = PyUnicode_New(newlen, maxchar);
242	2.07M	if (writer->buffer == NULL)
243	0	return -1;
244	2.07M	}
245	7.97M	else if (newlen > writer->size) {
246	74.8k	if (writer->overallocate
247	72.6k	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
248		/* overallocate to limit the number of realloc() */
249	72.6k	newlen += newlen / OVERALLOCATE_FACTOR;
250	72.6k	}
251	74.8k	if (newlen < writer->min_length)
252	0	newlen = writer->min_length;
253
254	74.8k	if (maxchar > writer->maxchar \|\| writer->readonly) {
255		/* resize + widen */
256	16.9k	maxchar = Py_MAX(maxchar, writer->maxchar);
257	16.9k	newbuffer = PyUnicode_New(newlen, maxchar);
258	16.9k	if (newbuffer == NULL)
259	0	return -1;
260	16.9k	_PyUnicode_FastCopyCharacters(newbuffer, 0,
261	16.9k	writer->buffer, 0, writer->pos);
262	16.9k	Py_DECREF(writer->buffer);
263	16.9k	writer->readonly = 0;
264	16.9k	}
265	57.8k	else {
266	57.8k	newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
267	57.8k	if (newbuffer == NULL)
268	0	return -1;
269	57.8k	}
270	74.8k	writer->buffer = newbuffer;
271	74.8k	}
272	7.89M	else if (maxchar > writer->maxchar) {
273	7.89M	assert(!writer->readonly);
274	7.89M	newbuffer = PyUnicode_New(writer->size, maxchar);
275	7.89M	if (newbuffer == NULL)
276	0	return -1;
277	7.89M	_PyUnicode_FastCopyCharacters(newbuffer, 0,
278	7.89M	writer->buffer, 0, writer->pos);
279	7.89M	Py_SETREF(writer->buffer, newbuffer);
280	7.89M	}
281	10.0M	_PyUnicodeWriter_Update(writer);
282	10.0M	return 0;
283
284	10.0M	#undef OVERALLOCATE_FACTOR
285	10.0M	}
286
287		int
288		_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
289		int kind)
290	8	{
291	8	Py_UCS4 maxchar;
292
293		/* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
294	8	assert(writer->kind < kind);
295
296	8	switch (kind)
297	8	{
298	0	case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
299	8	case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
300	0	case PyUnicode_4BYTE_KIND: maxchar = _Py_MAX_UNICODE; break;
301	0	default:
302	0	Py_UNREACHABLE();
303	8	}
304
305	8	return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
306	8	}
307
308
309		int
310		_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
311	1.98M	{
312	1.98M	return _PyUnicodeWriter_WriteCharInline(writer, ch);
313	1.98M	}
314
315
316		int
317		PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
318	444k	{
319	444k	if (ch > _Py_MAX_UNICODE) {
320	0	PyErr_SetString(PyExc_ValueError,
321	0	"character must be in range(0x110000)");
322	0	return -1;
323	0	}
324
325	444k	return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
326	444k	}
327
328
329		int
330		_PyUnicodeWriter_WriteStr(_PyUnicodeWriter writer, PyObject str)
331	1.80M	{
332	1.80M	assert(PyUnicode_Check(str));
333
334	1.80M	Py_UCS4 maxchar;
335	1.80M	Py_ssize_t len;
336
337	1.80M	len = PyUnicode_GET_LENGTH(str);
338	1.80M	if (len == 0)
339	4.34k	return 0;
340	1.80M	maxchar = PyUnicode_MAX_CHAR_VALUE(str);
341	1.80M	if (maxchar > writer->maxchar \|\| len > writer->size - writer->pos) {
342	41.1k	if (writer->buffer == NULL && !writer->overallocate) {
343	0	assert(_PyUnicode_CheckConsistency(str, 1));
344	0	writer->readonly = 1;
345	0	writer->buffer = Py_NewRef(str);
346	0	_PyUnicodeWriter_Update(writer);
347	0	writer->pos += len;
348	0	return 0;
349	0	}
350	41.1k	if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
351	0	return -1;
352	41.1k	}
353	1.80M	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
354	1.80M	str, 0, len);
355	1.80M	writer->pos += len;
356	1.80M	return 0;
357	1.80M	}
358
359
360		int
361		PyUnicodeWriter_WriteStr(PyUnicodeWriter writer, PyObject obj)
362	213k	{
363	213k	PyTypeObject *type = Py_TYPE(obj);
364	213k	if (type == &PyUnicode_Type) {
365	213k	return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
366	213k	}
367
368	0	if (type == &PyLong_Type) {
369	0	return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
370	0	}
371
372	0	PyObject *str = PyObject_Str(obj);
373	0	if (str == NULL) {
374	0	return -1;
375	0	}
376
377	0	int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
378	0	Py_DECREF(str);
379	0	return res;
380	0	}
381
382
383		int
384		PyUnicodeWriter_WriteRepr(PyUnicodeWriter writer, PyObject obj)
385	0	{
386	0	if (Py_TYPE(obj) == &PyLong_Type) {
387	0	return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
388	0	}
389
390	0	PyObject *repr = PyObject_Repr(obj);
391	0	if (repr == NULL) {
392	0	return -1;
393	0	}
394
395	0	int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
396	0	Py_DECREF(repr);
397	0	return res;
398	0	}
399
400
401		int
402		_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter writer, PyObject str,
403		Py_ssize_t start, Py_ssize_t end)
404	3.69M	{
405	3.69M	assert(0 <= start);
406	3.69M	assert(end <= PyUnicode_GET_LENGTH(str));
407	3.69M	assert(start <= end);
408
409	3.69M	if (start == 0 && end == PyUnicode_GET_LENGTH(str))
410	8.92k	return _PyUnicodeWriter_WriteStr(writer, str);
411
412	3.68M	Py_ssize_t len = end - start;
413	3.68M	if (len == 0) {
414	0	return 0;
415	0	}
416
417	3.68M	Py_UCS4 maxchar;
418	3.68M	if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
419	2.17M	maxchar = _PyUnicode_FindMaxChar(str, start, end);
420	2.17M	}
421	1.51M	else {
422	1.51M	maxchar = writer->maxchar;
423	1.51M	}
424	3.68M	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
425	0	return -1;
426	0	}
427
428	3.68M	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
429	3.68M	str, start, len);
430	3.68M	writer->pos += len;
431	3.68M	return 0;
432	3.68M	}
433
434
435		int
436		PyUnicodeWriter_WriteSubstring(PyUnicodeWriter writer, PyObject str,
437		Py_ssize_t start, Py_ssize_t end)
438	420k	{
439	420k	if (!PyUnicode_Check(str)) {
440	0	PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
441	0	return -1;
442	0	}
443	420k	if (start < 0 \|\| start > end) {
444	0	PyErr_Format(PyExc_ValueError, "invalid start argument");
445	0	return -1;
446	0	}
447	420k	if (end > PyUnicode_GET_LENGTH(str)) {
448	0	PyErr_Format(PyExc_ValueError, "invalid end argument");
449	0	return -1;
450	0	}
451
452	420k	return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
453	420k	start, end);
454	420k	}
455
456
457		int
458		_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
459		const char *ascii, Py_ssize_t len)
460	523k	{
461	523k	if (len == -1)
462	0	len = strlen(ascii);
463
464	523k	assert(ucs1lib_find_max_char((const Py_UCS1)ascii, (const Py_UCS1)ascii + len) < 128);
465
466	523k	if (writer->buffer == NULL && !writer->overallocate) {
467	38.0k	PyObject *str;
468
469	38.0k	str = _PyUnicode_FromASCII(ascii, len);
470	38.0k	if (str == NULL)
471	0	return -1;
472
473	38.0k	writer->readonly = 1;
474	38.0k	writer->buffer = str;
475	38.0k	_PyUnicodeWriter_Update(writer);
476	38.0k	writer->pos += len;
477	38.0k	return 0;
478	38.0k	}
479
480	485k	if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
481	0	return -1;
482
483	485k	switch (writer->kind)
484	485k	{
485	480k	case PyUnicode_1BYTE_KIND:
486	480k	{
487	480k	const Py_UCS1 str = (const Py_UCS1 )ascii;
488	480k	Py_UCS1 *data = writer->data;
489
490	480k	memcpy(data + writer->pos, str, len);
491	480k	break;
492	0	}
493	3.89k	case PyUnicode_2BYTE_KIND:
494	3.89k	{
495	3.89k	_PyUnicode_CONVERT_BYTES(
496	3.89k	Py_UCS1, Py_UCS2,
497	3.89k	ascii, ascii + len,
498	3.89k	(Py_UCS2 *)writer->data + writer->pos);
499	3.89k	break;
500	0	}
501	1.44k	case PyUnicode_4BYTE_KIND:
502	1.44k	{
503	1.44k	_PyUnicode_CONVERT_BYTES(
504	1.44k	Py_UCS1, Py_UCS4,
505	1.44k	ascii, ascii + len,
506	1.44k	(Py_UCS4 *)writer->data + writer->pos);
507	1.44k	break;
508	0	}
509	0	default:
510	0	Py_UNREACHABLE();
511	485k	}
512
513	485k	writer->pos += len;
514	485k	return 0;
515	485k	}
516
517
518		int
519		PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
520		const char *str,
521		Py_ssize_t size)
522	10	{
523	10	assert(writer != NULL);
524	10	_Py_AssertHoldsTstate();
525
526	10	_PyUnicodeWriter priv_writer = (_PyUnicodeWriter)writer;
527	10	return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
528	10	}
529
530
531		int
532		PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
533		const char *str,
534		Py_ssize_t size)
535	180k	{
536	180k	if (size < 0) {
537	180k	size = strlen(str);
538	180k	}
539
540	180k	_PyUnicodeWriter _writer = (_PyUnicodeWriter)writer;
541	180k	Py_ssize_t old_pos = _writer->pos;
542	180k	int res = _PyUnicode_DecodeUTF8Writer(_writer, str, size,
543	180k	_Py_ERROR_STRICT, NULL, NULL);
544	180k	if (res < 0) {
545	0	_writer->pos = old_pos;
546	0	}
547	180k	return res;
548	180k	}
549
550
551		int
552		PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
553		const char *string,
554		Py_ssize_t length,
555		const char *errors,
556		Py_ssize_t *consumed)
557	0	{
558	0	if (length < 0) {
559	0	length = strlen(string);
560	0	}
561
562	0	_PyUnicodeWriter _writer = (_PyUnicodeWriter)writer;
563	0	Py_ssize_t old_pos = _writer->pos;
564	0	int res = _PyUnicode_DecodeUTF8Writer(_writer, string, length,
565	0	_Py_ERROR_UNKNOWN, errors,
566	0	consumed);
567	0	if (res < 0) {
568	0	_writer->pos = old_pos;
569	0	if (consumed) {
570	0	*consumed = 0;
571	0	}
572	0	}
573	0	return res;
574	0	}
575
576
577		int
578		_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
579		const char *str, Py_ssize_t len)
580	0	{
581	0	Py_UCS4 maxchar;
582
583	0	maxchar = ucs1lib_find_max_char((const Py_UCS1)str, (const Py_UCS1)str + len);
584	0	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
585	0	return -1;
586	0	unicode_write_cstr(writer->buffer, writer->pos, str, len);
587	0	writer->pos += len;
588	0	return 0;
589	0	}
590
591
592		PyObject *
593		_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
594	9.13M	{
595	9.13M	PyObject *str;
596
597	9.13M	if (writer->pos == 0) {
598	3.84k	Py_CLEAR(writer->buffer);
599	3.84k	return _PyUnicode_GetEmpty();
600	3.84k	}
601
602	9.13M	str = writer->buffer;
603	9.13M	writer->buffer = NULL;
604
605	9.13M	if (writer->readonly) {
606	38.0k	assert(PyUnicode_GET_LENGTH(str) == writer->pos);
607	38.0k	return str;
608	38.0k	}
609
610	9.09M	if (PyUnicode_GET_LENGTH(str) != writer->pos) {
611	8.72M	PyObject *str2;
612	8.72M	str2 = _PyUnicode_ResizeCompact(str, writer->pos);
613	8.72M	if (str2 == NULL) {
614	0	Py_DECREF(str);
615	0	return NULL;
616	0	}
617	8.72M	str = str2;
618	8.72M	}
619
620	9.09M	assert(_PyUnicode_CheckConsistency(str, 1));
621	9.09M	return _PyUnicode_Result(str);
622	9.09M	}
623
624
625		PyObject*
626		PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
627	84.4k	{
628	84.4k	PyObject str = _PyUnicodeWriter_Finish((_PyUnicodeWriter)writer);
629	84.4k	assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
630	84.4k	_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
631	84.4k	return str;
632	84.4k	}
633
634
635		void
636		_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
637	2.48k	{
638		Py_CLEAR(writer->buffer);
639	2.48k	}