/src/cpython/Objects/unicode_writer.c

Source
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

#include "Python.h"
#include "pycore_freelist.h"      // _Py_FREELIST_FREE()
#include "pycore_long.h"          // _PyLong_FormatWriter()
#include "pycore_unicodeobject.h" // _PyUnicode_Result()


#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif


/* Compilation of templated routines */

#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()

#include "stringlib/ucs1lib.h"
#include "stringlib/find_max_char.h"
#include "stringlib/undef.h"


/* Copy an ASCII or latin1 char* string into a Python Unicode string.

   Writes the `len` bytes starting at `str` into `unicode`, beginning at
   character position `index`.  Every input byte becomes one character in
   the range 0x00..0xFF, stored in the width selected by the kind of
   `unicode` (1, 2 or 4 bytes per character).

   The input is read as unsigned bytes (Py_UCS1).  Plain `char` may be a
   signed type, in which case converting a byte >= 0x80 straight from
   `char` to Py_UCS2 / Py_UCS4 would sign-extend it (0xE9 would be stored
   as 0xFFE9 or 0xFFFFFFE9) instead of storing the latin1 code point.

   WARNING: The function doesn't copy the terminating null character and
   doesn't check the maximum character (may write a latin1 character in an
   ASCII string).  The bound index + len <= length of `unicode` is only
   verified by an assertion. */
static void
unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
                   const char *str, Py_ssize_t len)
{
    int kind = PyUnicode_KIND(unicode);
    void *data = PyUnicode_DATA(unicode);
    const Py_UCS1 *src = (const Py_UCS1 *)str;
    const Py_UCS1 *end = src + len;

    assert(index + len <= PyUnicode_GET_LENGTH(unicode));
    switch (kind) {
    case PyUnicode_1BYTE_KIND: {
#ifdef Py_DEBUG
        if (PyUnicode_IS_ASCII(unicode)) {
            Py_UCS4 maxchar = ucs1lib_find_max_char(
                (const Py_UCS1*)str,
                (const Py_UCS1*)str + len);
            assert(maxchar < 128);
        }
#endif
        /* Same width on both sides: a raw byte copy is enough. */
        memcpy((Py_UCS1 *)data + index, src, len);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        Py_UCS2 *start = (Py_UCS2 *)data + index;
        Py_UCS2 *ucs2 = start;

        /* Zero-extend each unsigned byte to 16 bits. */
        for (; src < end; ++ucs2, ++src) {
            *ucs2 = (Py_UCS2)*src;
        }

        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
        break;
    }
    case PyUnicode_4BYTE_KIND: {
        Py_UCS4 *start = (Py_UCS4 *)data + index;
        Py_UCS4 *ucs4 = start;

        /* Zero-extend each unsigned byte to 32 bits. */
        for (; src < end; ++ucs4, ++src) {
            *ucs4 = (Py_UCS4)*src;
        }

        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
        break;
    }
    default:
        Py_UNREACHABLE();
    }
}


/* Reload the writer's cached fields (maxchar, data, kind, size) from
   writer->buffer.  writer->buffer must not be NULL. */
static inline void
_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
{
    PyObject *buf = writer->buffer;

    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(buf);
    writer->data = PyUnicode_DATA(buf);

    if (writer->readonly) {
        /* use a value smaller than PyUnicode_1BYTE_KIND so
           _PyUnicodeWriter_PrepareKind() will copy the buffer. */
        writer->kind = 0;
        assert(writer->kind <= PyUnicode_1BYTE_KIND);

        /* Copy-on-write mode: report a size of 0 so that
         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
         * the next write. */
        writer->size = 0;
        return;
    }

    /* Writable buffer: expose its real kind and capacity. */
    writer->kind = PyUnicode_KIND(buf);
    writer->size = PyUnicode_GET_LENGTH(buf);
}


/* Reset `writer` to its initial state: every field zero (so no buffer),
   except min_char which is set to 127. */
void
_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
{
    memset(writer, 0, sizeof(_PyUnicodeWriter));

    /* The zeroed kind is smaller than PyUnicode_1BYTE_KIND, so
       _PyUnicodeWriter_PrepareKind() will copy the buffer. */
    assert(writer->kind == 0);
    assert(writer->kind < PyUnicode_1BYTE_KIND);

    /* ASCII is the bare minimum */
    writer->min_char = 127;
}


/* Create a new writer, preparing room for `length` characters.

   Negative lengths are rejected with ValueError; zero is accepted.
   Returns NULL with an exception set on failure.  The writer struct is
   taken from the unicode_writers freelist when one is available, and
   allocated with PyMem_Malloc() otherwise. */
PyUnicodeWriter*
PyUnicodeWriter_Create(Py_ssize_t length)
{
    if (length < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "length must be positive");
        return NULL;
    }

    PyUnicodeWriter *pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
    if (pub_writer == NULL) {
        const size_t nbytes = sizeof(_PyUnicodeWriter);
        pub_writer = (PyUnicodeWriter *)PyMem_Malloc(nbytes);
        if (pub_writer == NULL) {
            return (PyUnicodeWriter *)PyErr_NoMemory();
        }
    }

    _PyUnicodeWriter *priv = (_PyUnicodeWriter *)pub_writer;
    _PyUnicodeWriter_Init(priv);

    /* overallocate is still 0 here (zeroed by _PyUnicodeWriter_Init), so
       this first request is not padded. */
    if (_PyUnicodeWriter_Prepare(priv, length, 127) < 0) {
        PyUnicodeWriter_Discard(pub_writer);
        return NULL;
    }

    /* From now on, grow the buffer with overallocation. */
    priv->overallocate = 1;
    return pub_writer;
}


/* Throw away a writer without producing a string: drop its buffer and
   hand the struct to the unicode_writers freelist (PyMem_Free is passed
   as the fallback deallocator).  Passing NULL is a no-op. */
void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
{
    if (writer != NULL) {
        _PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
        _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    }
}


// Initialize _PyUnicodeWriter with initial buffer.
//
// The writer is zeroed and `buffer` is stored in writer->buffer as-is (this
// function does not create a new reference to it).  The cached
// maxchar/data/kind/size fields are then loaded from the buffer by
// _PyUnicodeWriter_Update(); since readonly is 0 after the memset, size
// becomes the buffer's length.  min_length is set to that length, so
// _PyUnicodeWriter_PrepareInternal() never reallocates below it.
void
_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
{
    memset(writer, 0, sizeof(*writer));
    writer->buffer = buffer;
    _PyUnicodeWriter_Update(writer);
    writer->min_length = writer->size;
}


/* Make sure the writer's buffer can receive `length` more characters at
   writer->pos, each no larger than `maxchar`.

   Depending on the writer's state this either allocates the first buffer,
   grows (and possibly widens) the existing one, or only widens it.  The
   cached fields are refreshed with _PyUnicodeWriter_Update() on success.

   Returns 0 on success and -1 on failure.  On the overflow path
   MemoryError is raised here; on the other failure paths the callee that
   returned NULL is expected to have set the exception.  writer->buffer is
   left untouched on every failure path. */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(length >= 0);
    assert(maxchar <= _Py_MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Reject requests whose end position would overflow Py_ssize_t. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    /* Minimum number of characters the buffer must hold after the write. */
    newlen = writer->pos + length;

    /* Never allocate a buffer narrower than the writer's min_char. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: no buffer yet -- allocate the first one. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        /* Honour the caller-requested minimum capacity. */
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the buffer is too short (this is always the case in
           read-only mode, where _PyUnicodeWriter_Update() reports a size
           of 0). */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A fresh string is allocated and the first writer->pos
               characters are copied into it; the writer then drops its
               reference to the old buffer and leaves read-only mode. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character width: resize the existing string.  The
               result replaces writer->buffer only if it is non-NULL. */
            newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: capacity is sufficient but the characters are too
           narrow -- reallocate at the same size with a larger maxchar and
           copy what has been written so far. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Re-read maxchar/data/kind/size from the (possibly new) buffer. */
    _PyUnicodeWriter_Update(writer);
    return 0;

/* NOTE: this undefines the file-scope macro defined near the top of the
   file, so OVERALLOCATE_FACTOR cannot be used below this point. */
#undef OVERALLOCATE_FACTOR
}

/* Widen the writer's buffer so that it can store characters of the given
   `kind`.  `kind` must be larger than the writer's current kind.  The kind
   is translated to the largest character it can represent and the work is
   delegated to _PyUnicodeWriter_PrepareInternal() with a length of 0.
   Returns that function's result. */
int
_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
                                     int kind)
{
    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
    assert(writer->kind < kind);

    Py_UCS4 limit;
    if (kind == PyUnicode_1BYTE_KIND) {
        limit = 0xff;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        limit = 0xffff;
    }
    else if (kind == PyUnicode_4BYTE_KIND) {
        limit = _Py_MAX_UNICODE;
    }
    else {
        Py_UNREACHABLE();
    }

    return _PyUnicodeWriter_PrepareInternal(writer, 0, limit);
}


/* Append the single character `ch` to the writer.

   Out-of-line wrapper: forwards to _PyUnicodeWriter_WriteCharInline()
   (defined elsewhere) and returns its result unchanged.  `ch` is not
   range-checked here; PyUnicodeWriter_WriteChar() does that before calling
   this function. */
int
_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
{
    return _PyUnicodeWriter_WriteCharInline(writer, ch);
}


/* Public API: append the character `ch` to the writer.

   Raises ValueError and returns -1 when `ch` is greater than
   _Py_MAX_UNICODE; otherwise returns the result of
   _PyUnicodeWriter_WriteChar(). */
int
PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
{
    if (ch <= _Py_MAX_UNICODE) {
        return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
    }

    PyErr_SetString(PyExc_ValueError,
                    "character must be in range(0x110000)");
    return -1;
}


/* Append the whole Unicode string `str` to the writer.

   An empty string is a no-op.  When the writer has no buffer yet and does
   not overallocate, no copy is made: the writer takes a new reference to
   `str` and uses it as a read-only buffer.  Otherwise the buffer is
   grown/widened if needed and the characters are copied in.

   Returns 0 on success, -1 if preparing the buffer failed. */
int
_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
{
    assert(PyUnicode_Check(str));

    const Py_ssize_t nchars = PyUnicode_GET_LENGTH(str);
    if (nchars == 0) {
        return 0;
    }

    const Py_UCS4 str_maxchar = PyUnicode_MAX_CHAR_VALUE(str);
    const int fits = (str_maxchar <= writer->maxchar
                      && nchars <= writer->size - writer->pos);
    if (!fits) {
        if (writer->buffer == NULL && !writer->overallocate) {
            /* Borrow the string itself as the buffer. */
            assert(_PyUnicode_CheckConsistency(str, 1));
            writer->readonly = 1;
            writer->buffer = Py_NewRef(str);
            _PyUnicodeWriter_Update(writer);
            writer->pos += nchars;
            return 0;
        }
        if (_PyUnicodeWriter_PrepareInternal(writer, nchars,
                                             str_maxchar) == -1) {
            return -1;
        }
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, 0, nchars);
    writer->pos += nchars;
    return 0;
}


/* Public API: append str(obj) to the writer.

   Objects whose type is exactly str are written directly, and objects
   whose type is exactly int are formatted straight into the writer by
   _PyLong_FormatWriter().  Anything else goes through PyObject_Str().

   Returns 0 on success, -1 on error. */
int
PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    PyTypeObject *type = Py_TYPE(obj);

    if (type == &PyUnicode_Type) {
        return _PyUnicodeWriter_WriteStr(priv, obj);
    }
    if (type == &PyLong_Type) {
        return _PyLong_FormatWriter(priv, obj, 10, 0);
    }

    PyObject *text = PyObject_Str(obj);
    if (text == NULL) {
        return -1;
    }

    int status = _PyUnicodeWriter_WriteStr(priv, text);
    Py_DECREF(text);
    return status;
}


/* Public API: append repr(obj) to the writer.

   Objects whose type is exactly int are formatted straight into the
   writer by _PyLong_FormatWriter(); everything else goes through
   PyObject_Repr().

   Returns 0 on success, -1 on error. */
int
PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;

    if (Py_TYPE(obj) == &PyLong_Type) {
        return _PyLong_FormatWriter(priv, obj, 10, 0);
    }

    PyObject *text = PyObject_Repr(obj);
    if (text == NULL) {
        return -1;
    }

    int status = _PyUnicodeWriter_WriteStr(priv, text);
    Py_DECREF(text);
    return status;
}


/* Append the characters str[start:end] to the writer.

   The caller must guarantee 0 <= start <= end <= len(str); this is only
   checked by assertions.  Writing the whole string is delegated to
   _PyUnicodeWriter_WriteStr(), and an empty range is a no-op.

   Returns 0 on success, -1 if preparing the buffer failed. */
int
_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
                                Py_ssize_t start, Py_ssize_t end)
{
    assert(0 <= start);
    assert(end <= PyUnicode_GET_LENGTH(str));
    assert(start <= end);

    if (start == 0 && end == PyUnicode_GET_LENGTH(str)) {
        return _PyUnicodeWriter_WriteStr(writer, str);
    }

    const Py_ssize_t count = end - start;
    if (count == 0) {
        return 0;
    }

    /* Only scan the slice for its real maximum character when the string
       as a whole might not fit in the writer's current width. */
    Py_UCS4 needed = writer->maxchar;
    if (PyUnicode_MAX_CHAR_VALUE(str) > needed) {
        needed = _PyUnicode_FindMaxChar(str, start, end);
    }

    if (_PyUnicodeWriter_Prepare(writer, count, needed) < 0) {
        return -1;
    }

    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                  str, start, count);
    writer->pos += count;
    return 0;
}


/* Public API: append str[start:end] to the writer after validating the
   arguments.

   Raises TypeError if `str` is not a str object, and ValueError if
   `start` is negative or greater than `end`, or if `end` exceeds the
   length of `str`.  Returns -1 in those cases; otherwise returns the
   result of _PyUnicodeWriter_WriteSubstring(). */
int
PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
                               Py_ssize_t start, Py_ssize_t end)
{
    if (!PyUnicode_Check(str)) {
        PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
    }
    else if (start < 0 || start > end) {
        PyErr_Format(PyExc_ValueError, "invalid start argument");
    }
    else if (end > PyUnicode_GET_LENGTH(str)) {
        PyErr_Format(PyExc_ValueError, "invalid end argument");
    }
    else {
        return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
                                               start, end);
    }
    return -1;
}


/* Append the ASCII string `ascii` of `len` bytes to the writer.

   If `len` is -1 the length is computed with strlen().  Every byte must
   be < 128; this is only checked by an assertion.

   When the writer has no buffer yet and does not overallocate, a new str
   object is created from the bytes and adopted as a read-only buffer.
   Otherwise the buffer is prepared and the bytes are copied, widening
   them to the writer's kind when necessary.

   Returns 0 on success, -1 on error. */
int
_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
                                  const char *ascii, Py_ssize_t len)
{
    if (len == -1)
        len = strlen(ascii);

    /* A zero-length write is a no-op, as in _PyUnicodeWriter_WriteStr() and
       _PyUnicodeWriter_WriteSubstring().  Returning early also keeps the
       kind switch below from being reached while the writer has no buffer
       yet (kind is still 0 after _PyUnicodeWriter_Init()), should
       _PyUnicodeWriter_Prepare() skip the allocation for an empty
       request. */
    if (len == 0) {
        return 0;
    }

    assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);

    if (writer->buffer == NULL && !writer->overallocate) {
        PyObject *str;

        /* Nothing written so far and no overallocation requested: build
           the string once and use it directly as a read-only buffer. */
        str = _PyUnicode_FromASCII(ascii, len);
        if (str == NULL)
            return -1;

        writer->readonly = 1;
        writer->buffer = str;
        _PyUnicodeWriter_Update(writer);
        writer->pos += len;
        return 0;
    }

    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
        return -1;

    switch (writer->kind)
    {
    case PyUnicode_1BYTE_KIND:
    {
        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
        Py_UCS1 *data = writer->data;

        /* Same width on both sides: plain byte copy. */
        memcpy(data + writer->pos, str, len);
        break;
    }
    case PyUnicode_2BYTE_KIND:
    {
        /* Widen each byte to a 2-byte character. */
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS2,
            ascii, ascii + len,
            (Py_UCS2 *)writer->data + writer->pos);
        break;
    }
    case PyUnicode_4BYTE_KIND:
    {
        /* Widen each byte to a 4-byte character. */
        _PyUnicode_CONVERT_BYTES(
            Py_UCS1, Py_UCS4,
            ascii, ascii + len,
            (Py_UCS4 *)writer->data + writer->pos);
        break;
    }
    default:
        Py_UNREACHABLE();
    }

    writer->pos += len;
    return 0;
}


/* Public API: append the ASCII string `str` of `size` bytes to the writer.

   Thin wrapper around _PyUnicodeWriter_WriteASCIIString(), which treats a
   size of -1 as "use strlen(str)" and only checks that the bytes are
   ASCII with an assertion.  Returns that function's result (0 on success,
   -1 on error).  `writer` must not be NULL (asserted). */
int
PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
                           const char *str,
                           Py_ssize_t size)
{
    assert(writer != NULL);
    /* NOTE(review): presumably asserts that the calling thread holds a
       thread state; the helper is defined elsewhere -- confirm. */
    _Py_AssertHoldsTstate();

    _PyUnicodeWriter *priv_writer = (_PyUnicodeWriter*)writer;
    return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
}


/* Public API: decode `size` bytes of UTF-8 from `str` and append the
   result to the writer.  A negative `size` means "use strlen(str)".

   Decoding is delegated to _PyUnicode_DecodeUTF8Writer() with the
   _Py_ERROR_STRICT handler.  If it fails, the writer's position is
   restored to its value on entry.  Returns the decoder's result. */
int
PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
                          const char *str,
                          Py_ssize_t size)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    const Py_ssize_t saved_pos = priv->pos;

    if (size < 0) {
        size = strlen(str);
    }

    int status = _PyUnicode_DecodeUTF8Writer(priv, str, size,
                                             _Py_ERROR_STRICT, NULL, NULL);
    if (status < 0) {
        /* Roll back the write position on failure. */
        priv->pos = saved_pos;
    }
    return status;
}


/* Public API: decode `length` bytes of UTF-8 from `string` with the error
   handler named by `errors`, and append the result to the writer.  A
   negative `length` means "use strlen(string)".

   `consumed` is passed through to _PyUnicode_DecodeUTF8Writer() and may be
   NULL.  If decoding fails, the writer's position is restored to its value
   on entry and *consumed (when provided) is set to 0.  Returns the
   decoder's result. */
int
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                   const char *string,
                                   Py_ssize_t length,
                                   const char *errors,
                                   Py_ssize_t *consumed)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;
    const Py_ssize_t saved_pos = priv->pos;

    if (length < 0) {
        length = strlen(string);
    }

    int status = _PyUnicode_DecodeUTF8Writer(priv, string, length,
                                             _Py_ERROR_UNKNOWN, errors,
                                             consumed);
    if (status >= 0) {
        return status;
    }

    /* Failure: undo the partial write and report nothing consumed. */
    priv->pos = saved_pos;
    if (consumed != NULL) {
        *consumed = 0;
    }
    return status;
}


/* Append the latin1 string `str` of `len` bytes to the writer.

   The bytes are scanned for their largest value so that the buffer is
   prepared with a sufficient character width, then copied with
   unicode_write_cstr().  Returns 0 on success, -1 if preparing the buffer
   failed. */
int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
                                   const char *str, Py_ssize_t len)
{
    const Py_UCS1 *bytes = (const Py_UCS1 *)str;
    Py_UCS4 widest = ucs1lib_find_max_char(bytes, bytes + len);

    if (_PyUnicodeWriter_Prepare(writer, len, widest) == -1) {
        return -1;
    }

    unicode_write_cstr(writer->buffer, writer->pos, str, len);
    writer->pos += len;
    return 0;
}


/* Finish writing and return the resulting string.

   The writer gives up its buffer: writer->buffer is NULL afterwards on
   every path.  If nothing was written, the buffer (if any) is released and
   the result of _PyUnicode_GetEmpty() is returned.  A read-only buffer is
   returned as-is.  Otherwise the buffer is resized to the number of
   characters written when the two differ, and the string is passed
   through _PyUnicode_Result().

   Returns NULL if the final resize failed. */
PyObject *
_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
{
    if (writer->pos == 0) {
        Py_CLEAR(writer->buffer);
        return _PyUnicode_GetEmpty();
    }

    /* Move the buffer reference out of the writer. */
    PyObject *result = writer->buffer;
    writer->buffer = NULL;

    if (writer->readonly) {
        assert(PyUnicode_GET_LENGTH(result) == writer->pos);
        return result;
    }

    if (PyUnicode_GET_LENGTH(result) != writer->pos) {
        /* Trim the unused (overallocated) tail. */
        PyObject *resized = _PyUnicode_ResizeCompact(result, writer->pos);
        if (resized == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        result = resized;
    }

    assert(_PyUnicode_CheckConsistency(result, 1));
    return _PyUnicode_Result(result);
}


/* Public API: finish writing, return the resulting string (or NULL on
   error), and dispose of the writer.  The writer struct is handed to the
   unicode_writers freelist whether or not finishing succeeded, so
   `writer` must not be used afterwards. */
PyObject*
PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
{
    _PyUnicodeWriter *priv = (_PyUnicodeWriter*)writer;

    PyObject *result = _PyUnicodeWriter_Finish(priv);
    assert(priv->buffer == NULL);

    _Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
    return result;
}


/* Release the writer's reference to its buffer, if any, and reset
   writer->buffer to NULL (Py_CLEAR).  The writer struct itself is not
   freed here. */
void
_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
{
    Py_CLEAR(writer->buffer);
}

Coverage Report

Created: 2025-11-02 06:30

Line	Count	Source
1		/*
2
3		Unicode implementation based on original code by Fredrik Lundh,
4		modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6		Major speed upgrades to the method implementations at the Reykjavik
7		NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9		Copyright (c) Corporation for National Research Initiatives.
10
11		--------------------------------------------------------------------
12		The original string type implementation is:
13
14		Copyright (c) 1999 by Secret Labs AB
15		Copyright (c) 1999 by Fredrik Lundh
16
17		By obtaining, using, and/or copying this software and/or its
18		associated documentation, you agree that you have read, understood,
19		and will comply with the following terms and conditions:
20
21		Permission to use, copy, modify, and distribute this software and its
22		associated documentation for any purpose and without fee is hereby
23		granted, provided that the above copyright notice appears in all
24		copies, and that both that copyright notice and this permission notice
25		appear in supporting documentation, and that the name of Secret Labs
26		AB or the author not be used in advertising or publicity pertaining to
27		distribution of the software without specific, written prior
28		permission.
29
30		SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31		THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32		FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33		ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34		WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35		ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36		OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37		--------------------------------------------------------------------
38
39		*/
40
41		#include "Python.h"
42		#include "pycore_freelist.h" // _Py_FREELIST_FREE()
43		#include "pycore_long.h" // _PyLong_FormatWriter()
44		#include "pycore_unicodeobject.h" // _PyUnicode_Result()
45
46
47		#ifdef MS_WINDOWS
48		/* On Windows, overallocate by 50% is the best factor */
49		# define OVERALLOCATE_FACTOR 2
50		#else
51		/* On Linux, overallocate by 25% is the best factor */
52	103M	# define OVERALLOCATE_FACTOR 4
53		#endif
54
55
56		/* Compilation of templated routines */
57
58		#define STRINGLIB_GET_EMPTY() _PyUnicode_GetEmpty()
59
60		#include "stringlib/ucs1lib.h"
61		#include "stringlib/find_max_char.h"
62		#include "stringlib/undef.h"
63
64
65		/* Copy an ASCII or latin1 char* string into a Python Unicode string.
66
67		WARNING: The function doesn't copy the terminating null character and
68		doesn't check the maximum character (may write a latin1 character in an
69		ASCII string). */
70		static void
71		unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
72		const char *str, Py_ssize_t len)
73	0	{
74	0	int kind = PyUnicode_KIND(unicode);
75	0	const void *data = PyUnicode_DATA(unicode);
76	0	const char *end = str + len;
77
78	0	assert(index + len <= PyUnicode_GET_LENGTH(unicode));
79	0	switch (kind) {
80	0	case PyUnicode_1BYTE_KIND: {
81		#ifdef Py_DEBUG
82		if (PyUnicode_IS_ASCII(unicode)) {
83		Py_UCS4 maxchar = ucs1lib_find_max_char(
84		(const Py_UCS1*)str,
85		(const Py_UCS1*)str + len);
86		assert(maxchar < 128);
87		}
88		#endif
89	0	memcpy((char *) data + index, str, len);
90	0	break;
91	0	}
92	0	case PyUnicode_2BYTE_KIND: {
93	0	Py_UCS2 *start = (Py_UCS2 *)data + index;
94	0	Py_UCS2 *ucs2 = start;
95
96	0	for (; str < end; ++ucs2, ++str)
97	0	*ucs2 = (Py_UCS2)*str;
98
99	0	assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
100	0	break;
101	0	}
102	0	case PyUnicode_4BYTE_KIND: {
103	0	Py_UCS4 *start = (Py_UCS4 *)data + index;
104	0	Py_UCS4 *ucs4 = start;
105
106	0	for (; str < end; ++ucs4, ++str)
107	0	*ucs4 = (Py_UCS4)*str;
108
109	0	assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
110	0	break;
111	0	}
112	0	default:
113	0	Py_UNREACHABLE();
114	0	}
115	0	}
116
117
118		static inline void
119		_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
120	68.7M	{
121	68.7M	writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
122	68.7M	writer->data = PyUnicode_DATA(writer->buffer);
123
124	68.7M	if (!writer->readonly) {
125	68.7M	writer->kind = PyUnicode_KIND(writer->buffer);
126	68.7M	writer->size = PyUnicode_GET_LENGTH(writer->buffer);
127	68.7M	}
128	16.2k	else {
129		/* use a value smaller than PyUnicode_1BYTE_KIND() so
130		_PyUnicodeWriter_PrepareKind() will copy the buffer. */
131	16.2k	writer->kind = 0;
132	16.2k	assert(writer->kind <= PyUnicode_1BYTE_KIND);
133
134		/* Copy-on-write mode: set buffer size to 0 so
135		* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
136		* next write. */
137	16.2k	writer->size = 0;
138	16.2k	}
139	68.7M	}
140
141
142		void
143		_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
144	54.1M	{
145	54.1M	memset(writer, 0, sizeof(*writer));
146
147		/* ASCII is the bare minimum */
148	54.1M	writer->min_char = 127;
149
150		/* use a kind value smaller than PyUnicode_1BYTE_KIND so
151		_PyUnicodeWriter_PrepareKind() will copy the buffer. */
152	54.1M	assert(writer->kind == 0);
153	54.1M	assert(writer->kind < PyUnicode_1BYTE_KIND);
154	54.1M	}
155
156
157		PyUnicodeWriter*
158		PyUnicodeWriter_Create(Py_ssize_t length)
159	4.41M	{
160	4.41M	if (length < 0) {
161	0	PyErr_SetString(PyExc_ValueError,
162	0	"length must be positive");
163	0	return NULL;
164	0	}
165
166	4.41M	const size_t size = sizeof(_PyUnicodeWriter);
167	4.41M	PyUnicodeWriter *pub_writer;
168	4.41M	pub_writer = _Py_FREELIST_POP_MEM(unicode_writers);
169	4.41M	if (pub_writer == NULL) {
170	2.48M	pub_writer = (PyUnicodeWriter *)PyMem_Malloc(size);
171	2.48M	if (pub_writer == NULL) {
172	0	return (PyUnicodeWriter *)PyErr_NoMemory();
173	0	}
174	2.48M	}
175	4.41M	_PyUnicodeWriter *writer = (_PyUnicodeWriter *)pub_writer;
176
177	4.41M	_PyUnicodeWriter_Init(writer);
178	4.41M	if (_PyUnicodeWriter_Prepare(writer, length, 127) < 0) {
179	0	PyUnicodeWriter_Discard(pub_writer);
180	0	return NULL;
181	0	}
182	4.41M	writer->overallocate = 1;
183
184	4.41M	return pub_writer;
185	4.41M	}
186
187
188		void PyUnicodeWriter_Discard(PyUnicodeWriter *writer)
189	64.4k	{
190	64.4k	if (writer == NULL) {
191	63.9k	return;
192	63.9k	}
193	469	_PyUnicodeWriter_Dealloc((_PyUnicodeWriter*)writer);
194	469	_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
195	469	}
196
197
198		// Initialize _PyUnicodeWriter with initial buffer
199		void
200		_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
201	681k	{
202	681k	memset(writer, 0, sizeof(*writer));
203	681k	writer->buffer = buffer;
204	681k	_PyUnicodeWriter_Update(writer);
205	681k	writer->min_length = writer->size;
206	681k	}
207
208
209		int
210		_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
211		Py_ssize_t length, Py_UCS4 maxchar)
212	68.0M	{
213	68.0M	Py_ssize_t newlen;
214	68.0M	PyObject *newbuffer;
215
216	68.0M	assert(length >= 0);
217	68.0M	assert(maxchar <= _Py_MAX_UNICODE);
218
219		/* ensure that the _PyUnicodeWriter_Prepare macro was used */
220	68.0M	assert((maxchar > writer->maxchar && length >= 0)
221	68.0M	|| length > 0);
222
223	68.0M	if (length > PY_SSIZE_T_MAX - writer->pos) {
224	0	PyErr_NoMemory();
225	0	return -1;
226	0	}
227	68.0M	newlen = writer->pos + length;
228
229	68.0M	maxchar = Py_MAX(maxchar, writer->min_char);
230
231	68.0M	if (writer->buffer == NULL) {
232	48.8M	assert(!writer->readonly);
233	48.8M	if (writer->overallocate
234	35.7M	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
235		/* overallocate to limit the number of realloc() */
236	35.7M	newlen += newlen / OVERALLOCATE_FACTOR;
237	35.7M	}
238	48.8M	if (newlen < writer->min_length)
239	44.3M	newlen = writer->min_length;
240
241	48.8M	writer->buffer = PyUnicode_New(newlen, maxchar);
242	48.8M	if (writer->buffer == NULL)
243	0	return -1;
244	48.8M	}
245	19.1M	else if (newlen > writer->size) {
246	16.3M	if (writer->overallocate
247	15.9M	&& newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
248		/* overallocate to limit the number of realloc() */
249	15.9M	newlen += newlen / OVERALLOCATE_FACTOR;
250	15.9M	}
251	16.3M	if (newlen < writer->min_length)
252	1.34k	newlen = writer->min_length;
253
254	16.3M	if (maxchar > writer->maxchar || writer->readonly) {
255		/* resize + widen */
256	3.84M	maxchar = Py_MAX(maxchar, writer->maxchar);
257	3.84M	newbuffer = PyUnicode_New(newlen, maxchar);
258	3.84M	if (newbuffer == NULL)
259	0	return -1;
260	3.84M	_PyUnicode_FastCopyCharacters(newbuffer, 0,
261	3.84M	writer->buffer, 0, writer->pos);
262	3.84M	Py_DECREF(writer->buffer);
263	3.84M	writer->readonly = 0;
264	3.84M	}
265	12.4M	else {
266	12.4M	newbuffer = _PyUnicode_ResizeCompact(writer->buffer, newlen);
267	12.4M	if (newbuffer == NULL)
268	0	return -1;
269	12.4M	}
270	16.3M	writer->buffer = newbuffer;
271	16.3M	}
272	2.86M	else if (maxchar > writer->maxchar) {
273	2.86M	assert(!writer->readonly);
274	2.86M	newbuffer = PyUnicode_New(writer->size, maxchar);
275	2.86M	if (newbuffer == NULL)
276	0	return -1;
277	2.86M	_PyUnicode_FastCopyCharacters(newbuffer, 0,
278	2.86M	writer->buffer, 0, writer->pos);
279	2.86M	Py_SETREF(writer->buffer, newbuffer);
280	2.86M	}
281	68.0M	_PyUnicodeWriter_Update(writer);
282	68.0M	return 0;
283
284	68.0M	#undef OVERALLOCATE_FACTOR
285	68.0M	}
286
287		int
288		_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
289		int kind)
290	150k	{
291	150k	Py_UCS4 maxchar;
292
293		/* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
294	150k	assert(writer->kind < kind);
295
296	150k	switch (kind)
297	150k	{
298	0	case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
299	150k	case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
300	0	case PyUnicode_4BYTE_KIND: maxchar = _Py_MAX_UNICODE; break;
301	0	default:
302	0	Py_UNREACHABLE();
303	150k	}
304
305	150k	return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
306	150k	}
307
308
309		int
310		_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
311	98.4M	{
312	98.4M	return _PyUnicodeWriter_WriteCharInline(writer, ch);
313	98.4M	}
314
315
316		int
317		PyUnicodeWriter_WriteChar(PyUnicodeWriter *writer, Py_UCS4 ch)
318	66.8M	{
319	66.8M	if (ch > _Py_MAX_UNICODE) {
320	0	PyErr_SetString(PyExc_ValueError,
321	0	"character must be in range(0x110000)");
322	0	return -1;
323	0	}
324
325	66.8M	return _PyUnicodeWriter_WriteChar((_PyUnicodeWriter*)writer, ch);
326	66.8M	}
327
328
329		int
330		_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
331	70.1M	{
332	70.1M	assert(PyUnicode_Check(str));
333
334	70.1M	Py_UCS4 maxchar;
335	70.1M	Py_ssize_t len;
336
337	70.1M	len = PyUnicode_GET_LENGTH(str);
338	70.1M	if (len == 0)
339	26.2M	return 0;
340	43.8M	maxchar = PyUnicode_MAX_CHAR_VALUE(str);
341	43.8M	if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
342	24.2M	if (writer->buffer == NULL && !writer->overallocate) {
343	8.17k	assert(_PyUnicode_CheckConsistency(str, 1));
344	8.17k	writer->readonly = 1;
345	8.17k	writer->buffer = Py_NewRef(str);
346	8.17k	_PyUnicodeWriter_Update(writer);
347	8.17k	writer->pos += len;
348	8.17k	return 0;
349	8.17k	}
350	24.2M	if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
351	0	return -1;
352	24.2M	}
353	43.8M	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
354	43.8M	str, 0, len);
355	43.8M	writer->pos += len;
356	43.8M	return 0;
357	43.8M	}
358
359
360		int
361		PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
362	4.44M	{
363	4.44M	PyTypeObject *type = Py_TYPE(obj);
364	4.44M	if (type == &PyUnicode_Type) {
365	4.44M	return _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, obj);
366	4.44M	}
367
368	0	if (type == &PyLong_Type) {
369	0	return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
370	0	}
371
372	0	PyObject *str = PyObject_Str(obj);
373	0	if (str == NULL) {
374	0	return -1;
375	0	}
376
377	0	int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, str);
378	0	Py_DECREF(str);
379	0	return res;
380	0	}
381
382
383		int
384		PyUnicodeWriter_WriteRepr(PyUnicodeWriter *writer, PyObject *obj)
385	7.98M	{
386	7.98M	if (Py_TYPE(obj) == &PyLong_Type) {
387	847k	return _PyLong_FormatWriter((_PyUnicodeWriter*)writer, obj, 10, 0);
388	847k	}
389
390	7.13M	PyObject *repr = PyObject_Repr(obj);
391	7.13M	if (repr == NULL) {
392	0	return -1;
393	0	}
394
395	7.13M	int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter*)writer, repr);
396	7.13M	Py_DECREF(repr);
397	7.13M	return res;
398	7.13M	}
399
400
401		int
402		_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
403		Py_ssize_t start, Py_ssize_t end)
404	72.2M	{
405	72.2M	assert(0 <= start);
406	72.2M	assert(end <= PyUnicode_GET_LENGTH(str));
407	72.2M	assert(start <= end);
408
409	72.2M	if (start == 0 && end == PyUnicode_GET_LENGTH(str))
410	116	return _PyUnicodeWriter_WriteStr(writer, str);
411
412	72.2M	Py_ssize_t len = end - start;
413	72.2M	if (len == 0) {
414	0	return 0;
415	0	}
416
417	72.2M	Py_UCS4 maxchar;
418	72.2M	if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar) {
419	14.1M	maxchar = _PyUnicode_FindMaxChar(str, start, end);
420	14.1M	}
421	58.0M	else {
422	58.0M	maxchar = writer->maxchar;
423	58.0M	}
424	72.2M	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0) {
425	0	return -1;
426	0	}
427
428	72.2M	_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
429	72.2M	str, start, len);
430	72.2M	writer->pos += len;
431	72.2M	return 0;
432	72.2M	}
433
434
435		int
436		PyUnicodeWriter_WriteSubstring(PyUnicodeWriter *writer, PyObject *str,
437		Py_ssize_t start, Py_ssize_t end)
438	589k	{
439	589k	if (!PyUnicode_Check(str)) {
440	0	PyErr_Format(PyExc_TypeError, "expect str, not %T", str);
441	0	return -1;
442	0	}
443	589k	if (start < 0 || start > end) {
444	0	PyErr_Format(PyExc_ValueError, "invalid start argument");
445	0	return -1;
446	0	}
447	589k	if (end > PyUnicode_GET_LENGTH(str)) {
448	0	PyErr_Format(PyExc_ValueError, "invalid end argument");
449	0	return -1;
450	0	}
451
452	589k	return _PyUnicodeWriter_WriteSubstring((_PyUnicodeWriter*)writer, str,
453	589k	start, end);
454	589k	}
455
456
457		int
458		_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
459		const char *ascii, Py_ssize_t len)
460	46.0M	{
461	46.0M	if (len == -1)
462	0	len = strlen(ascii);
463
464	46.0M	assert(ucs1lib_find_max_char((const Py_UCS1)ascii, (const Py_UCS1)ascii + len) < 128);
465
466	46.0M	if (writer->buffer == NULL && !writer->overallocate) {
467	8.04k	PyObject *str;
468
469	8.04k	str = _PyUnicode_FromASCII(ascii, len);
470	8.04k	if (str == NULL)
471	0	return -1;
472
473	8.04k	writer->readonly = 1;
474	8.04k	writer->buffer = str;
475	8.04k	_PyUnicodeWriter_Update(writer);
476	8.04k	writer->pos += len;
477	8.04k	return 0;
478	8.04k	}
479
480	45.9M	if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
481	0	return -1;
482
483	45.9M	switch (writer->kind)
484	45.9M	{
485	45.9M	case PyUnicode_1BYTE_KIND:
486	45.9M	{
487	45.9M	const Py_UCS1 str = (const Py_UCS1 )ascii;
488	45.9M	Py_UCS1 *data = writer->data;
489
490	45.9M	memcpy(data + writer->pos, str, len);
491	45.9M	break;
492	0	}
493	12.3k	case PyUnicode_2BYTE_KIND:
494	12.3k	{
495	12.3k	_PyUnicode_CONVERT_BYTES(
496	12.3k	Py_UCS1, Py_UCS2,
497	12.3k	ascii, ascii + len,
498	12.3k	(Py_UCS2 *)writer->data + writer->pos);
499	12.3k	break;
500	0	}
501	3.75k	case PyUnicode_4BYTE_KIND:
502	3.75k	{
503	3.75k	_PyUnicode_CONVERT_BYTES(
504	3.75k	Py_UCS1, Py_UCS4,
505	3.75k	ascii, ascii + len,
506	3.75k	(Py_UCS4 *)writer->data + writer->pos);
507	3.75k	break;
508	0	}
509	0	default:
510	0	Py_UNREACHABLE();
511	45.9M	}
512
513	45.9M	writer->pos += len;
514	45.9M	return 0;
515	45.9M	}
516
517
518		int
519		PyUnicodeWriter_WriteASCII(PyUnicodeWriter *writer,
520		const char *str,
521		Py_ssize_t size)
522	564k	{
523	564k	assert(writer != NULL);
524	564k	_Py_AssertHoldsTstate();
525
526	564k	_PyUnicodeWriter priv_writer = (_PyUnicodeWriter)writer;
527	564k	return _PyUnicodeWriter_WriteASCIIString(priv_writer, str, size);
528	564k	}
529
530
531		int
532		PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
533		const char *str,
534		Py_ssize_t size)
535	0	{
536	0	if (size < 0) {
537	0	size = strlen(str);
538	0	}
539
540	0	_PyUnicodeWriter _writer = (_PyUnicodeWriter)writer;
541	0	Py_ssize_t old_pos = _writer->pos;
542	0	int res = _PyUnicode_DecodeUTF8Writer(_writer, str, size,
543	0	_Py_ERROR_STRICT, NULL, NULL);
544	0	if (res < 0) {
545	0	_writer->pos = old_pos;
546	0	}
547	0	return res;
548	0	}
549
550
551		int
552		PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
553		const char *string,
554		Py_ssize_t length,
555		const char *errors,
556		Py_ssize_t *consumed)
557	0	{
558	0	if (length < 0) {
559	0	length = strlen(string);
560	0	}
561
562	0	_PyUnicodeWriter _writer = (_PyUnicodeWriter)writer;
563	0	Py_ssize_t old_pos = _writer->pos;
564	0	int res = _PyUnicode_DecodeUTF8Writer(_writer, string, length,
565	0	_Py_ERROR_UNKNOWN, errors,
566	0	consumed);
567	0	if (res < 0) {
568	0	_writer->pos = old_pos;
569	0	if (consumed) {
570	0	*consumed = 0;
571	0	}
572	0	}
573	0	return res;
574	0	}
575
576
577		int
578		_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
579		const char *str, Py_ssize_t len)
580	0	{
581	0	Py_UCS4 maxchar;
582
583	0	maxchar = ucs1lib_find_max_char((const Py_UCS1)str, (const Py_UCS1)str + len);
584	0	if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
585	0	return -1;
586	0	unicode_write_cstr(writer->buffer, writer->pos, str, len);
587	0	writer->pos += len;
588	0	return 0;
589	0	}
590
591
592		PyObject *
593		_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
594	49.5M	{
595	49.5M	PyObject *str;
596
597	49.5M	if (writer->pos == 0) {
598	702	Py_CLEAR(writer->buffer);
599	702	return _PyUnicode_GetEmpty();
600	702	}
601
602	49.5M	str = writer->buffer;
603	49.5M	writer->buffer = NULL;
604
605	49.5M	if (writer->readonly) {
606	14.8k	assert(PyUnicode_GET_LENGTH(str) == writer->pos);
607	14.8k	return str;
608	14.8k	}
609
610	49.5M	if (PyUnicode_GET_LENGTH(str) != writer->pos) {
611	48.4M	PyObject *str2;
612	48.4M	str2 = _PyUnicode_ResizeCompact(str, writer->pos);
613	48.4M	if (str2 == NULL) {
614	0	Py_DECREF(str);
615	0	return NULL;
616	0	}
617	48.4M	str = str2;
618	48.4M	}
619
620	49.5M	assert(_PyUnicode_CheckConsistency(str, 1));
621	49.5M	return _PyUnicode_Result(str);
622	49.5M	}
623
624
625		PyObject*
626		PyUnicodeWriter_Finish(PyUnicodeWriter *writer)
627	4.41M	{
628	4.41M	PyObject str = _PyUnicodeWriter_Finish((_PyUnicodeWriter)writer);
629	4.41M	assert(((_PyUnicodeWriter*)writer)->buffer == NULL);
630	4.41M	_Py_FREELIST_FREE(unicode_writers, writer, PyMem_Free);
631	4.41M	return str;
632	4.41M	}
633
634
635		void
636		_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
637	5.30M	{
638		Py_CLEAR(writer->buffer);
639	5.30M	}