Coverage Report

Created: 2026-04-12 06:54

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/pystrhex.c
Line
Count
Source
1
/* Format bytes as hexadecimal */
2
3
#include "Python.h"
4
#include "pycore_strhex.h"        // _Py_strhex_with_sep()
5
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()
6
7
/* Scalar hexlify: convert len bytes to 2*len hex characters.
8
   Uses table lookup via Py_hexdigits for the conversion. */
9
static inline void
10
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
11
182
{
12
    /* Various optimizations like using math instead of a table lookup,
13
       manually unrolling the loop, storing the global table pointer locally,
14
       and doing wider dst writes have been tried and benchmarked; all produced
15
       nearly identical performance on gcc 15.  Using a 256 entry uint16_t
16
       table was a bit slower.  So we keep our old simple and obvious code. */
17
182
    for (Py_ssize_t i = 0; i < len; i++) {
18
0
        unsigned char c = src[i];
19
0
        *dst++ = Py_hexdigits[c >> 4];
20
0
        *dst++ = Py_hexdigits[c & 0x0f];
21
0
    }
22
182
}
23
24
/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
25
   Uses __builtin_shufflevector for portable interleave that compiles to
26
   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
27
   NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
28
   for the target microarch allow it [try -march=native if running 32-bit
29
   on an RPi3 or later]).
30
31
   Performance:
32
   - For more common small data it varies between 1.1-3x faster.
33
   - Up to 11x faster on larger data than the scalar code.
34
35
   While faster is possible for big data using AVX2 or AVX512, that
36
   adds a ton of complication. Who ever really hexes huge data?
37
   The 16-64 byte boosts align nicely with md5 - sha512 hexdigests.
38
*/
39
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
40
41
/* 16 lanes of unsigned bytes packed into one 128-bit vector. */
typedef unsigned char v16u8 __attribute__((vector_size(16)));

/* Same 128-bit layout with signed lanes, used only for comparisons.
   A signed greater-than compiles to a single pcmpgtb on x86-64, whereas
   the unsigned form needs the slower psubusb+pcmpeqb pair.  On ARM NEON
   both forms cost the same. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Build an unsigned vector whose 16 lanes all hold x. */
static inline v16u8
v16u8_splat(unsigned char x)
{
    const v16u8 lanes = {
        x, x, x, x,  x, x, x, x,
        x, x, x, x,  x, x, x, x,
    };
    return lanes;
}

/* Build a signed vector whose 16 lanes all hold x. */
static inline v16s8
v16s8_splat(signed char x)
{
    const v16s8 lanes = {
        x, x, x, x,  x, x, x, x,
        x, x, x, x,  x, x, x, x,
    };
    return lanes;
}
61
62
/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
63
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
64
static void
65
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
66
182
{
67
182
    const v16u8 mask_0f = v16u8_splat(0x0f);
68
182
    const v16u8 ascii_0 = v16u8_splat('0');
69
182
    const v16u8 offset = v16u8_splat('a' - '0' - 10);  /* 0x27 */
70
182
    const v16s8 nine = v16s8_splat(9);
71
72
182
    Py_ssize_t i = 0;
73
74
    /* Process 16 bytes at a time */
75
728
    for (; i + 16 <= len; i += 16, dst += 32) {
76
        /* Load 16 bytes (memcpy for safe unaligned access) */
77
546
        v16u8 data;
78
546
        memcpy(&data, src + i, 16);
79
80
        /* Extract high and low nibbles using vector operators */
81
546
        v16u8 hi = (data >> 4) & mask_0f;
82
546
        v16u8 lo = data & mask_0f;
83
84
        /* Compare > 9 using signed comparison for efficient codegen.
85
           Nibble values 0-15 are safely in signed byte range.
86
           This generates pcmpgtb on x86-64, avoiding the slower
87
           psubusb+pcmpeqb sequence from unsigned comparison. */
88
546
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
89
546
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
90
91
        /* Convert nibbles to hex ASCII */
92
546
        hi = hi + ascii_0 + (hi_gt9 & offset);
93
546
        lo = lo + ascii_0 + (lo_gt9 & offset);
94
95
        /* Interleave hi/lo nibbles using portable shufflevector.
96
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
97
           or vzip on ARM32. */
98
546
        v16u8 result0 = __builtin_shufflevector(hi, lo,
99
546
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
100
546
        v16u8 result1 = __builtin_shufflevector(hi, lo,
101
546
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
102
103
        /* Store 32 hex characters */
104
546
        memcpy(dst, &result0, 16);
105
546
        memcpy(dst + 16, &result1, 16);
106
546
    }
107
108
    /* Scalar fallback for remaining 0-15 bytes */
109
182
    _Py_hexlify_scalar(src + i, dst, len - i);
110
182
}
111
112
#endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */
113
114
static PyObject *
115
_Py_strhex_impl(const char* argbuf, Py_ssize_t arglen,
116
                PyObject* sep, Py_ssize_t bytes_per_sep_group,
117
                int return_bytes)
118
182
{
119
182
    assert(arglen >= 0);
120
121
182
    Py_UCS1 sep_char = 0;
122
182
    if (sep) {
123
0
        Py_ssize_t seplen = PyObject_Length((PyObject*)sep);
124
0
        if (seplen < 0) {
125
0
            return NULL;
126
0
        }
127
0
        if (seplen != 1) {
128
0
            PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
129
0
            return NULL;
130
0
        }
131
0
        if (PyUnicode_Check(sep)) {
132
0
            if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
133
0
                PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
134
0
                return NULL;
135
0
            }
136
0
            sep_char = PyUnicode_READ_CHAR(sep, 0);
137
0
        }
138
0
        else if (PyBytes_Check(sep)) {
139
0
            sep_char = PyBytes_AS_STRING(sep)[0];
140
0
        }
141
0
        else {
142
0
            PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
143
0
            return NULL;
144
0
        }
145
0
        if (sep_char > 127 && !return_bytes) {
146
0
            PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
147
0
            return NULL;
148
0
        }
149
0
    }
150
182
    else {
151
182
        bytes_per_sep_group = 0;
152
182
    }
153
182
    size_t abs_bytes_per_sep = _Py_ABS_CAST(size_t, bytes_per_sep_group);
154
182
    Py_ssize_t resultlen = 0;
155
182
    if (bytes_per_sep_group && arglen > 0) {
156
        /* How many sep characters we'll be inserting. */
157
0
        resultlen = (arglen - 1) / abs_bytes_per_sep;
158
0
    }
159
    /* Bounds checking for our Py_ssize_t indices. */
160
182
    if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
161
0
        return PyErr_NoMemory();
162
0
    }
163
182
    resultlen += arglen * 2;
164
165
182
    if ((size_t)abs_bytes_per_sep >= (size_t)arglen) {
166
0
        bytes_per_sep_group = 0;
167
0
        abs_bytes_per_sep = 0;
168
0
    }
169
170
182
    PyObject *retval;
171
182
    Py_UCS1 *retbuf;
172
182
    if (return_bytes) {
173
        /* If _PyBytes_FromSize() were public we could avoid malloc+copy. */
174
0
        retval = PyBytes_FromStringAndSize(NULL, resultlen);
175
0
        if (!retval) {
176
0
            return NULL;
177
0
        }
178
0
        retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
179
0
    }
180
182
    else {
181
182
        retval = PyUnicode_New(resultlen, 127);
182
182
        if (!retval) {
183
0
            return NULL;
184
0
        }
185
182
        retbuf = PyUnicode_1BYTE_DATA(retval);
186
182
    }
187
188
    /* Hexlify */
189
182
    Py_ssize_t i, j;
190
182
    unsigned char c;
191
192
182
    if (bytes_per_sep_group == 0) {
193
182
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
194
182
        if (arglen >= 16) {
195
182
            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
196
182
        }
197
0
        else
198
0
#endif
199
0
        {
200
0
            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
201
0
        }
202
182
    }
203
0
    else {
204
        /* The number of complete chunk+sep periods */
205
0
        Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
206
0
        Py_ssize_t chunk;
207
0
        size_t k;
208
209
0
        if (bytes_per_sep_group < 0) {
210
0
            i = j = 0;
211
0
            for (chunk = 0; chunk < chunks; chunk++) {
212
0
                for (k = 0; k < abs_bytes_per_sep; k++) {
213
0
                    c = argbuf[i++];
214
0
                    retbuf[j++] = Py_hexdigits[c >> 4];
215
0
                    retbuf[j++] = Py_hexdigits[c & 0x0f];
216
0
                }
217
0
                retbuf[j++] = sep_char;
218
0
            }
219
0
            while (i < arglen) {
220
0
                c = argbuf[i++];
221
0
                retbuf[j++] = Py_hexdigits[c >> 4];
222
0
                retbuf[j++] = Py_hexdigits[c & 0x0f];
223
0
            }
224
0
            assert(j == resultlen);
225
0
        }
226
0
        else {
227
0
            i = arglen - 1;
228
0
            j = resultlen - 1;
229
0
            for (chunk = 0; chunk < chunks; chunk++) {
230
0
                for (k = 0; k < abs_bytes_per_sep; k++) {
231
0
                    c = argbuf[i--];
232
0
                    retbuf[j--] = Py_hexdigits[c & 0x0f];
233
0
                    retbuf[j--] = Py_hexdigits[c >> 4];
234
0
                }
235
0
                retbuf[j--] = sep_char;
236
0
            }
237
0
            while (i >= 0) {
238
0
                c = argbuf[i--];
239
0
                retbuf[j--] = Py_hexdigits[c & 0x0f];
240
0
                retbuf[j--] = Py_hexdigits[c >> 4];
241
0
            }
242
0
            assert(j == -1);
243
0
        }
244
0
    }
245
246
#ifdef Py_DEBUG
247
    if (!return_bytes) {
248
        assert(_PyUnicode_CheckConsistency(retval, 1));
249
    }
250
#endif
251
252
182
    return retval;
253
182
}
254
255
/* Hex-encode the arglen bytes at argbuf with no separators and return the
   result as a str.  Returns NULL if _Py_strhex_impl() fails. */
PyObject *
_Py_strhex(const char *argbuf, Py_ssize_t arglen)
{
    return _Py_strhex_impl(argbuf, arglen,
                           /* sep */ NULL,
                           /* bytes_per_sep_group */ 0,
                           /* return_bytes */ 0);
}
259
260
/* Same as above but returns a bytes() instead of str() to avoid the
261
 * need to decode the str() when bytes are needed. */
262
PyObject* _Py_strhex_bytes(const char* argbuf, Py_ssize_t arglen)
263
0
{
264
0
    return _Py_strhex_impl(argbuf, arglen, NULL, 0, 1);
265
0
}
266
267
/* These variants include support for a separator between every N bytes: */
268
269
/* Hex-encode the arglen bytes at argbuf into a str, passing sep and
   bytes_per_group through to _Py_strhex_impl() to control separator
   placement.  Returns NULL if _Py_strhex_impl() fails. */
PyObject *
_Py_strhex_with_sep(const char *argbuf, Py_ssize_t arglen,
                    PyObject *sep, Py_ssize_t bytes_per_group)
{
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group,
                           /* return_bytes */ 0);
}
274
275
/* Same as above but returns a bytes() instead of str() to avoid the
276
 * need to decode the str() when bytes are needed. */
277
PyObject* _Py_strhex_bytes_with_sep(const char* argbuf, Py_ssize_t arglen,
278
                                    PyObject* sep, Py_ssize_t bytes_per_group)
279
0
{
280
0
    return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 1);
281
0
}