/src/cpython/Python/pystrhex.c

Source
/* Format bytes as hexadecimal */

#include "Python.h"
#include "pycore_strhex.h"        // _Py_strhex_with_sep()
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()

/* Scalar hexlify: convert len bytes to 2*len hex characters.
   Uses table lookup via Py_hexdigits for the conversion. */
static inline void
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Various optimizations like using math instead of a table lookup,
       manually unrolling the loop, storing the global table pointer locally,
       and doing wider dst writes have been tried and benchmarked; all produced
       nearly identical performance on gcc 15.  Using a 256 entry uint16_t
       table was a bit slower.  So we keep our old simple and obvious code. */
    for (Py_ssize_t i = 0; i < len; i++) {
        unsigned char c = src[i];
        *dst++ = Py_hexdigits[c >> 4];
        *dst++ = Py_hexdigits[c & 0x0f];
    }
}

/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
   Uses __builtin_shufflevector for portable interleave that compiles to
   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
   NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
   for the target microarch allow it [try -march=native if running 32-bit
   on an RPi3 or later]).

   Performance:
   - For more common small data it varies between 1.1-3x faster.
   - Up to 11x faster on larger data than the scalar code.

   While faster is possible for big data using AVX2 or AVX512, that
   adds a ton of complication. Who ever really hexes huge data?
   The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
*/
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR

/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of
   the slower psubusb+pcmpeqb sequence from unsigned comparison.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Splat a byte value across all 16 lanes */
static inline v16u8
v16u8_splat(unsigned char x)
{
    return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

static inline v16s8
v16s8_splat(signed char x)
{
    return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
}

/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    const v16u8 offset = v16u8_splat('a' - '0' - 10);  /* 0x27 */
    const v16s8 nine = v16s8_splat(9);

    Py_ssize_t i = 0;

    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);

        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;

        /* Compare > 9 using signed comparison for efficient codegen.
           Nibble values 0-15 are safely in signed byte range.
           This generates pcmpgtb on x86-64, avoiding the slower
           psubusb+pcmpeqb sequence from unsigned comparison. */
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);

        /* Convert nibbles to hex ASCII */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);

        /* Interleave hi/lo nibbles using portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
           or vzip on ARM32. */
        v16u8 result0 = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        /* Store 32 hex characters */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }

    /* Scalar fallback for remaining 0-15 bytes */
    _Py_hexlify_scalar(src + i, dst, len - i);
}

#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */

static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
                                 PyObject* sep, int bytes_per_sep_group,
                                 const int return_bytes)
{
    assert(arglen >= 0);

    Py_UCS1 sep_char = 0;
    if (sep) {
        Py_ssize_t seplen = PyObject_Length((PyObject*)sep);
        if (seplen < 0) {
            return NULL;
        }
        if (seplen != 1) {
            PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
            return NULL;
        }
        if (PyUnicode_Check(sep)) {
            if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
                PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
                return NULL;
            }
            sep_char = PyUnicode_READ_CHAR(sep, 0);
        }
        else if (PyBytes_Check(sep)) {
            sep_char = PyBytes_AS_STRING(sep)[0];
        }
        else {
            PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
            return NULL;
        }
        if (sep_char > 127 && !return_bytes) {
            PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
            return NULL;
        }
    }
    else {
        bytes_per_sep_group = 0;
    }
    unsigned int abs_bytes_per_sep = _Py_ABS_CAST(unsigned int, bytes_per_sep_group);
    Py_ssize_t resultlen = 0;
    if (bytes_per_sep_group && arglen > 0) {
        /* How many sep characters we'll be inserting. */
        resultlen = (arglen - 1) / abs_bytes_per_sep;
    }
    /* Bounds checking for our Py_ssize_t indices. */
    if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
        return PyErr_NoMemory();
    }
    resultlen += arglen * 2;

    if ((size_t)abs_bytes_per_sep >= (size_t)arglen) {
        bytes_per_sep_group = 0;
        abs_bytes_per_sep = 0;
    }

    PyObject *retval;
    Py_UCS1 *retbuf;
    if (return_bytes) {
        /* If _PyBytes_FromSize() were public we could avoid malloc+copy. */
        retval = PyBytes_FromStringAndSize(NULL, resultlen);
        if (!retval) {
            return NULL;
        }
        retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
    }
    else {
        retval = PyUnicode_New(resultlen, 127);
        if (!retval) {
            return NULL;
        }
        retbuf = PyUnicode_1BYTE_DATA(retval);
    }

    /* Hexlify */
    Py_ssize_t i, j;
    unsigned char c;

    if (bytes_per_sep_group == 0) {
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
        if (arglen >= 16) {
            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
        }
        else
#endif
        {
            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
        }
    }
    else {
        /* The number of complete chunk+sep periods */
        Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
        Py_ssize_t chunk;
        unsigned int k;

        if (bytes_per_sep_group < 0) {
            i = j = 0;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i++];
                    retbuf[j++] = Py_hexdigits[c >> 4];
                    retbuf[j++] = Py_hexdigits[c & 0x0f];
                }
                retbuf[j++] = sep_char;
            }
            while (i < arglen) {
                c = argbuf[i++];
                retbuf[j++] = Py_hexdigits[c >> 4];
                retbuf[j++] = Py_hexdigits[c & 0x0f];
            }
            assert(j == resultlen);
        }
        else {
            i = arglen - 1;
            j = resultlen - 1;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i--];
                    retbuf[j--] = Py_hexdigits[c & 0x0f];
                    retbuf[j--] = Py_hexdigits[c >> 4];
                }
                retbuf[j--] = sep_char;
            }
            while (i >= 0) {
                c = argbuf[i--];
                retbuf[j--] = Py_hexdigits[c & 0x0f];
                retbuf[j--] = Py_hexdigits[c >> 4];
            }
            assert(j == -1);
        }
    }

#ifdef Py_DEBUG
    if (!return_bytes) {
        assert(_PyUnicode_CheckConsistency(retval, 1));
    }
#endif

    return retval;
}

PyObject * _Py_strhex(const char* argbuf, const Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen, NULL, 0, 0);
}

/* Same as above but returns a bytes() instead of str() to avoid the
 * need to decode the str() when bytes are needed. */
PyObject* _Py_strhex_bytes(const char* argbuf, const Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen, NULL, 0, 1);
}

/* These variants include support for a separator between every N bytes: */

PyObject* _Py_strhex_with_sep(const char* argbuf, const Py_ssize_t arglen,
                              PyObject* sep, const int bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 0);
}

/* Same as above but returns a bytes() instead of str() to avoid the
 * need to decode the str() when bytes are needed. */
PyObject* _Py_strhex_bytes_with_sep(const char* argbuf, const Py_ssize_t arglen,
                                    PyObject* sep, const int bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 1);
}

Coverage Report

Created: 2026-02-26 06:53

Line	Count	Source
1		/* Format bytes as hexadecimal */
2
3		#include "Python.h"
4		#include "pycore_strhex.h" // _Py_strhex_with_sep()
5		#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
6
7		/* Scalar hexlify: convert len bytes to 2*len hex characters.
8		Uses table lookup via Py_hexdigits for the conversion. */
9		static inline void
10		_Py_hexlify_scalar(const unsigned char src, Py_UCS1 dst, Py_ssize_t len)
11	90	{
12		/* Various optimizations like using math instead of a table lookup,
13		manually unrolling the loop, storing the global table pointer locally,
14		and doing wider dst writes have been tried and benchmarked; all produced
15		nearly identical performance on gcc 15. Using a 256 entry uint16_t
16		table was a bit slower. So we keep our old simple and obvious code. */
17	90	for (Py_ssize_t i = 0; i < len; i++) {
18	0	unsigned char c = src[i];
19	0	*dst++ = Py_hexdigits[c >> 4];
20	0	*dst++ = Py_hexdigits[c & 0x0f];
21	0	}
22	90	}
23
24		/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
25		Uses __builtin_shufflevector for portable interleave that compiles to
26		native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
27		NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
28		for the target microarch allow it [try -march=native if running 32-bit
29		on an RPi3 or later]).
30
31		Performance:
32		- For more common small data it varies between 1.1-3x faster.
33		- Up to 11x faster on larger data than the scalar code.
34
35		While faster is possible for big data using AVX2 or AVX512, that
36		adds a ton of complication. Who ever really hexes huge data?
37		The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
38		*/
39		#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
40
41		/* 128-bit vector of 16 unsigned bytes */
42		typedef unsigned char v16u8 __attribute__((vector_size(16)));
43		/* 128-bit vector of 16 signed bytes - for efficient comparison.
44		Using signed comparison generates pcmpgtb on x86-64 instead of
45		the slower psubusb+pcmpeqb sequence from unsigned comparison.
46		ARM NEON performs the same either way. */
47		typedef signed char v16s8 __attribute__((vector_size(16)));
48
49		/* Splat a byte value across all 16 lanes */
50		static inline v16u8
51		v16u8_splat(unsigned char x)
52	270	{
53	270	return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
54	270	}
55
56		static inline v16s8
57		v16s8_splat(signed char x)
58	90	{
59	90	return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
60	90	}
61
62		/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
63		Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
64		static void
65		_Py_hexlify_simd(const unsigned char src, Py_UCS1 dst, Py_ssize_t len)
66	90	{
67	90	const v16u8 mask_0f = v16u8_splat(0x0f);
68	90	const v16u8 ascii_0 = v16u8_splat('0');
69	90	const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
70	90	const v16s8 nine = v16s8_splat(9);
71
72	90	Py_ssize_t i = 0;
73
74		/* Process 16 bytes at a time */
75	360	for (; i + 16 <= len; i += 16, dst += 32) {
76		/* Load 16 bytes (memcpy for safe unaligned access) */
77	270	v16u8 data;
78	270	memcpy(&data, src + i, 16);
79
80		/* Extract high and low nibbles using vector operators */
81	270	v16u8 hi = (data >> 4) & mask_0f;
82	270	v16u8 lo = data & mask_0f;
83
84		/* Compare > 9 using signed comparison for efficient codegen.
85		Nibble values 0-15 are safely in signed byte range.
86		This generates pcmpgtb on x86-64, avoiding the slower
87		psubusb+pcmpeqb sequence from unsigned comparison. */
88	270	v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
89	270	v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
90
91		/* Convert nibbles to hex ASCII */
92	270	hi = hi + ascii_0 + (hi_gt9 & offset);
93	270	lo = lo + ascii_0 + (lo_gt9 & offset);
94
95		/* Interleave hi/lo nibbles using portable shufflevector.
96		This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
97		or vzip on ARM32. */
98	270	v16u8 result0 = __builtin_shufflevector(hi, lo,
99	270	0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
100	270	v16u8 result1 = __builtin_shufflevector(hi, lo,
101	270	8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
102
103		/* Store 32 hex characters */
104	270	memcpy(dst, &result0, 16);
105	270	memcpy(dst + 16, &result1, 16);
106	270	}
107
108		/* Scalar fallback for remaining 0-15 bytes */
109	90	_Py_hexlify_scalar(src + i, dst, len - i);
110	90	}
111
112		#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
113
114		static PyObject _Py_strhex_impl(const char argbuf, const Py_ssize_t arglen,
115		PyObject* sep, int bytes_per_sep_group,
116		const int return_bytes)
117	90	{
118	90	assert(arglen >= 0);
119
120	90	Py_UCS1 sep_char = 0;
121	90	if (sep) {
122	0	Py_ssize_t seplen = PyObject_Length((PyObject*)sep);
123	0	if (seplen < 0) {
124	0	return NULL;
125	0	}
126	0	if (seplen != 1) {
127	0	PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
128	0	return NULL;
129	0	}
130	0	if (PyUnicode_Check(sep)) {
131	0	if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
132	0	PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
133	0	return NULL;
134	0	}
135	0	sep_char = PyUnicode_READ_CHAR(sep, 0);
136	0	}
137	0	else if (PyBytes_Check(sep)) {
138	0	sep_char = PyBytes_AS_STRING(sep)[0];
139	0	}
140	0	else {
141	0	PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
142	0	return NULL;
143	0	}
144	0	if (sep_char > 127 && !return_bytes) {
145	0	PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
146	0	return NULL;
147	0	}
148	0	}
149	90	else {
150	90	bytes_per_sep_group = 0;
151	90	}
152	90	unsigned int abs_bytes_per_sep = _Py_ABS_CAST(unsigned int, bytes_per_sep_group);
153	90	Py_ssize_t resultlen = 0;
154	90	if (bytes_per_sep_group && arglen > 0) {
155		/* How many sep characters we'll be inserting. */
156	0	resultlen = (arglen - 1) / abs_bytes_per_sep;
157	0	}
158		/* Bounds checking for our Py_ssize_t indices. */
159	90	if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
160	0	return PyErr_NoMemory();
161	0	}
162	90	resultlen += arglen * 2;
163
164	90	if ((size_t)abs_bytes_per_sep >= (size_t)arglen) {
165	0	bytes_per_sep_group = 0;
166	0	abs_bytes_per_sep = 0;
167	0	}
168
169	90	PyObject *retval;
170	90	Py_UCS1 *retbuf;
171	90	if (return_bytes) {
172		/* If _PyBytes_FromSize() were public we could avoid malloc+copy. */
173	0	retval = PyBytes_FromStringAndSize(NULL, resultlen);
174	0	if (!retval) {
175	0	return NULL;
176	0	}
177	0	retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
178	0	}
179	90	else {
180	90	retval = PyUnicode_New(resultlen, 127);
181	90	if (!retval) {
182	0	return NULL;
183	0	}
184	90	retbuf = PyUnicode_1BYTE_DATA(retval);
185	90	}
186
187		/* Hexlify */
188	90	Py_ssize_t i, j;
189	90	unsigned char c;
190
191	90	if (bytes_per_sep_group == 0) {
192	90	#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
193	90	if (arglen >= 16) {
194	90	_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
195	90	}
196	0	else
197	0	#endif
198	0	{
199	0	_Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
200	0	}
201	90	}
202	0	else {
203		/* The number of complete chunk+sep periods */
204	0	Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
205	0	Py_ssize_t chunk;
206	0	unsigned int k;
207
208	0	if (bytes_per_sep_group < 0) {
209	0	i = j = 0;
210	0	for (chunk = 0; chunk < chunks; chunk++) {
211	0	for (k = 0; k < abs_bytes_per_sep; k++) {
212	0	c = argbuf[i++];
213	0	retbuf[j++] = Py_hexdigits[c >> 4];
214	0	retbuf[j++] = Py_hexdigits[c & 0x0f];
215	0	}
216	0	retbuf[j++] = sep_char;
217	0	}
218	0	while (i < arglen) {
219	0	c = argbuf[i++];
220	0	retbuf[j++] = Py_hexdigits[c >> 4];
221	0	retbuf[j++] = Py_hexdigits[c & 0x0f];
222	0	}
223	0	assert(j == resultlen);
224	0	}
225	0	else {
226	0	i = arglen - 1;
227	0	j = resultlen - 1;
228	0	for (chunk = 0; chunk < chunks; chunk++) {
229	0	for (k = 0; k < abs_bytes_per_sep; k++) {
230	0	c = argbuf[i--];
231	0	retbuf[j--] = Py_hexdigits[c & 0x0f];
232	0	retbuf[j--] = Py_hexdigits[c >> 4];
233	0	}
234	0	retbuf[j--] = sep_char;
235	0	}
236	0	while (i >= 0) {
237	0	c = argbuf[i--];
238	0	retbuf[j--] = Py_hexdigits[c & 0x0f];
239	0	retbuf[j--] = Py_hexdigits[c >> 4];
240	0	}
241	0	assert(j == -1);
242	0	}
243	0	}
244
245		#ifdef Py_DEBUG
246		if (!return_bytes) {
247		assert(_PyUnicode_CheckConsistency(retval, 1));
248		}
249		#endif
250
251	90	return retval;
252	90	}
253
254		PyObject * _Py_strhex(const char* argbuf, const Py_ssize_t arglen)
255	90	{
256	90	return _Py_strhex_impl(argbuf, arglen, NULL, 0, 0);
257	90	}
258
259		/* Same as above but returns a bytes() instead of str() to avoid the
260		* need to decode the str() when bytes are needed. */
261		PyObject* _Py_strhex_bytes(const char* argbuf, const Py_ssize_t arglen)
262	0	{
263	0	return _Py_strhex_impl(argbuf, arglen, NULL, 0, 1);
264	0	}
265
266		/* These variants include support for a separator between every N bytes: */
267
268		PyObject* _Py_strhex_with_sep(const char* argbuf, const Py_ssize_t arglen,
269		PyObject* sep, const int bytes_per_group)
270	0	{
271	0	return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 0);
272	0	}
273
274		/* Same as above but returns a bytes() instead of str() to avoid the
275		* need to decode the str() when bytes are needed. */
276		PyObject* _Py_strhex_bytes_with_sep(const char* argbuf, const Py_ssize_t arglen,
277		PyObject* sep, const int bytes_per_group)
278	0	{
279	0	return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 1);
280	0	}