/src/cpython/Python/pystrhex.c
Line | Count | Source |
1 | | /* Format bytes as hexadecimal */ |
2 | | |
3 | | #include "Python.h" |
4 | | #include "pycore_strhex.h" // _Py_strhex_with_sep() |
5 | | #include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency() |
6 | | |
7 | | /* Scalar hexlify: convert len bytes to 2*len hex characters. |
8 | | Uses table lookup via Py_hexdigits for the conversion. */ |
9 | | static inline void |
10 | | _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len) |
11 | 90 | { |
12 | | /* Various optimizations like using math instead of a table lookup, |
13 | | manually unrolling the loop, storing the global table pointer locally, |
14 | | and doing wider dst writes have been tried and benchmarked; all produced |
15 | | nearly identical performance on gcc 15. Using a 256 entry uint16_t |
16 | | table was a bit slower. So we keep our old simple and obvious code. */ |
17 | 90 | for (Py_ssize_t i = 0; i < len; i++) { |
18 | 0 | unsigned char c = src[i]; |
19 | 0 | *dst++ = Py_hexdigits[c >> 4]; |
20 | 0 | *dst++ = Py_hexdigits[c & 0x0f]; |
21 | 0 | } |
22 | 90 | } |
23 | | |
24 | | /* Portable SIMD optimization for hexlify using GCC/Clang vector extensions. |
25 | | Uses __builtin_shufflevector for portable interleave that compiles to |
26 | | native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always], |
27 | | NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags |
28 | | for the target microarch allow it [try -march=native if running 32-bit |
29 | | on an RPi3 or later]). |
30 | | |
31 | | Performance: |
32 | | - For more common small data it varies between 1.1-3x faster. |
33 | | - Up to 11x faster on larger data than the scalar code. |
34 | | |
35 | | While faster is possible for big data using AVX2 or AVX512, that |
36 | | adds a ton of complication. Who ever really hexes huge data? |
37 | | The 16-64 byte boosts align nicely with md5 - sha512 hexdigests. |
38 | | */ |
39 | | #ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR |
40 | | |
/* 128-bit vector of 16 unsigned bytes (GCC/Clang vector extension type;
   supports element-wise operators and __builtin_shufflevector). */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of
   the slower psubusb+pcmpeqb sequence from unsigned comparison.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));
48 | | |
49 | | /* Splat a byte value across all 16 lanes */ |
50 | | static inline v16u8 |
51 | | v16u8_splat(unsigned char x) |
52 | 270 | { |
53 | 270 | return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}; |
54 | 270 | } |
55 | | |
56 | | static inline v16s8 |
57 | | v16s8_splat(signed char x) |
58 | 90 | { |
59 | 90 | return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}; |
60 | 90 | } |
61 | | |
/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32).
   Output is identical to _Py_hexlify_scalar(); dst must have room for
   2*len characters. */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Loop-invariant constant vectors, hoisted out of the main loop. */
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
    const v16s8 nine = v16s8_splat(9);

    Py_ssize_t i = 0;

    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);

        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;

        /* Compare > 9 using signed comparison for efficient codegen.
           Nibble values 0-15 are safely in signed byte range.
           This generates pcmpgtb on x86-64, avoiding the slower
           psubusb+pcmpeqb sequence from unsigned comparison. */
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);

        /* Convert nibbles to hex ASCII.  Comparison lanes are all-ones
           (0xff) where the nibble is 10-15, so (gt9 & offset) adds the
           extra 0x27 only there: v <= 9 maps to '0'+v, v >= 10 maps to
           'a'+(v-10). */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);

        /* Interleave hi/lo nibbles using portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
           or vzip on ARM32. */
        v16u8 result0 = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        /* Store 32 hex characters */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }

    /* Scalar fallback for remaining 0-15 bytes */
    _Py_hexlify_scalar(src + i, dst, len - i);
}
111 | | |
112 | | #endif /* HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR */ |
113 | | |
/* Core implementation shared by every _Py_strhex*() entry point.

   argbuf/arglen         : input bytes to hexlify (arglen must be >= 0).
   sep                   : optional separator, a length-1 str or bytes,
                           or NULL for no separator.
   bytes_per_sep_group   : insert sep_char between groups of this many
                           input bytes; negative groups from the left
                           (forward pass below), positive groups from
                           the right (backward pass below).  Ignored
                           when sep is NULL.
   return_bytes          : non-zero -> return a bytes object,
                           zero     -> return a str object.

   Returns a new reference, or NULL with an exception set
   (ValueError/TypeError for bad sep, MemoryError on overflow/alloc). */
static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
                                 PyObject* sep, int bytes_per_sep_group,
                                 const int return_bytes)
{
    assert(arglen >= 0);

    /* Validate sep and extract its single character. */
    Py_UCS1 sep_char = 0;
    if (sep) {
        Py_ssize_t seplen = PyObject_Length((PyObject*)sep);
        if (seplen < 0) {
            return NULL;
        }
        if (seplen != 1) {
            PyErr_SetString(PyExc_ValueError, "sep must be length 1.");
            return NULL;
        }
        if (PyUnicode_Check(sep)) {
            /* Only latin-1 range str separators are representable here. */
            if (PyUnicode_KIND(sep) != PyUnicode_1BYTE_KIND) {
                PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
                return NULL;
            }
            sep_char = PyUnicode_READ_CHAR(sep, 0);
        }
        else if (PyBytes_Check(sep)) {
            sep_char = PyBytes_AS_STRING(sep)[0];
        }
        else {
            PyErr_SetString(PyExc_TypeError, "sep must be str or bytes.");
            return NULL;
        }
        /* A non-ASCII separator is only allowed in bytes output; the str
           result is allocated as a pure-ASCII string below. */
        if (sep_char > 127 && !return_bytes) {
            PyErr_SetString(PyExc_ValueError, "sep must be ASCII.");
            return NULL;
        }
    }
    else {
        /* No separator object means no grouping at all. */
        bytes_per_sep_group = 0;
    }
    unsigned int abs_bytes_per_sep = _Py_ABS_CAST(unsigned int, bytes_per_sep_group);
    Py_ssize_t resultlen = 0;
    if (bytes_per_sep_group && arglen > 0) {
        /* How many sep characters we'll be inserting. */
        resultlen = (arglen - 1) / abs_bytes_per_sep;
    }
    /* Bounds checking for our Py_ssize_t indices: the final length is
       2*arglen + (sep count); refuse inputs where that would overflow. */
    if (arglen >= PY_SSIZE_T_MAX / 2 - resultlen) {
        return PyErr_NoMemory();
    }
    resultlen += arglen * 2;

    /* A group at least as large as the whole input never gets a
       separator; fall back to the plain no-separator fast path.
       (resultlen computed above is already 2*arglen in that case.) */
    if ((size_t)abs_bytes_per_sep >= (size_t)arglen) {
        bytes_per_sep_group = 0;
        abs_bytes_per_sep = 0;
    }

    /* Allocate the result object and get a raw pointer to its buffer. */
    PyObject *retval;
    Py_UCS1 *retbuf;
    if (return_bytes) {
        /* If _PyBytes_FromSize() were public we could avoid malloc+copy. */
        retval = PyBytes_FromStringAndSize(NULL, resultlen);
        if (!retval) {
            return NULL;
        }
        retbuf = (Py_UCS1 *)PyBytes_AS_STRING(retval);
    }
    else {
        /* maxchar 127: pure-ASCII compact unicode object. */
        retval = PyUnicode_New(resultlen, 127);
        if (!retval) {
            return NULL;
        }
        retbuf = PyUnicode_1BYTE_DATA(retval);
    }

    /* Hexlify */
    Py_ssize_t i, j;
    unsigned char c;

    if (bytes_per_sep_group == 0) {
#ifdef HAVE_EFFICIENT_BUILTIN_SHUFFLEVECTOR
        /* SIMD path needs at least one full 16-byte chunk to pay off. */
        if (arglen >= 16) {
            _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
        }
        else
#endif
        {
            _Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
        }
    }
    else {
        /* The number of complete chunk+sep periods */
        Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
        Py_ssize_t chunk;
        unsigned int k;

        if (bytes_per_sep_group < 0) {
            /* Negative group size: group from the left.  Walk forward,
               emitting a separator after each complete group; any short
               group ends up at the END of the output. */
            i = j = 0;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i++];
                    retbuf[j++] = Py_hexdigits[c >> 4];
                    retbuf[j++] = Py_hexdigits[c & 0x0f];
                }
                retbuf[j++] = sep_char;
            }
            /* Trailing partial group (no separator after it). */
            while (i < arglen) {
                c = argbuf[i++];
                retbuf[j++] = Py_hexdigits[c >> 4];
                retbuf[j++] = Py_hexdigits[c & 0x0f];
            }
            assert(j == resultlen);
        }
        else {
            /* Positive group size: group from the right.  Walk backward
               from the last byte so any short group ends up at the START
               of the output; digits are written low-nibble first because
               we are filling the buffer in reverse. */
            i = arglen - 1;
            j = resultlen - 1;
            for (chunk = 0; chunk < chunks; chunk++) {
                for (k = 0; k < abs_bytes_per_sep; k++) {
                    c = argbuf[i--];
                    retbuf[j--] = Py_hexdigits[c & 0x0f];
                    retbuf[j--] = Py_hexdigits[c >> 4];
                }
                retbuf[j--] = sep_char;
            }
            /* Leading partial group (no separator before it). */
            while (i >= 0) {
                c = argbuf[i--];
                retbuf[j--] = Py_hexdigits[c & 0x0f];
                retbuf[j--] = Py_hexdigits[c >> 4];
            }
            assert(j == -1);
        }
    }

#ifdef Py_DEBUG
    /* Debug builds: verify the unicode object we hand-filled is valid. */
    if (!return_bytes) {
        assert(_PyUnicode_CheckConsistency(retval, 1));
    }
#endif

    return retval;
}
253 | | |
254 | | PyObject * _Py_strhex(const char* argbuf, const Py_ssize_t arglen) |
255 | 90 | { |
256 | 90 | return _Py_strhex_impl(argbuf, arglen, NULL, 0, 0); |
257 | 90 | } |
258 | | |
259 | | /* Same as above but returns a bytes() instead of str() to avoid the |
260 | | * need to decode the str() when bytes are needed. */ |
261 | | PyObject* _Py_strhex_bytes(const char* argbuf, const Py_ssize_t arglen) |
262 | 0 | { |
263 | 0 | return _Py_strhex_impl(argbuf, arglen, NULL, 0, 1); |
264 | 0 | } |
265 | | |
266 | | /* These variants include support for a separator between every N bytes: */ |
267 | | |
268 | | PyObject* _Py_strhex_with_sep(const char* argbuf, const Py_ssize_t arglen, |
269 | | PyObject* sep, const int bytes_per_group) |
270 | 0 | { |
271 | 0 | return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 0); |
272 | 0 | } |
273 | | |
274 | | /* Same as above but returns a bytes() instead of str() to avoid the |
275 | | * need to decode the str() when bytes are needed. */ |
276 | | PyObject* _Py_strhex_bytes_with_sep(const char* argbuf, const Py_ssize_t arglen, |
277 | | PyObject* sep, const int bytes_per_group) |
278 | 0 | { |
279 | 0 | return _Py_strhex_impl(argbuf, arglen, sep, bytes_per_group, 1); |
280 | 0 | } |