/src/Python-3.8.3/Python/pyhash.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* Set of hash utility functions to help maintaining the invariant that |
2 | | if a==b then hash(a)==hash(b) |
3 | | |
4 | | All the utility functions (_Py_Hash*()) return "-1" to signify an error. |
5 | | */ |
6 | | #include "Python.h" |
7 | | |
8 | | #ifdef __APPLE__ |
9 | | # include <libkern/OSByteOrder.h> |
10 | | #elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H) |
11 | | # include <endian.h> |
12 | | #elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H) |
13 | | # include <sys/endian.h> |
14 | | #endif |
15 | | |
16 | | #ifdef __cplusplus |
17 | | extern "C" { |
18 | | #endif |
19 | | |
20 | | _Py_HashSecret_t _Py_HashSecret = {{0}}; |
21 | | |
22 | | #if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL |
23 | | extern PyHash_FuncDef PyHash_Func; |
24 | | #else |
25 | | static PyHash_FuncDef PyHash_Func; |
26 | | #endif |
27 | | |
28 | | /* Count _Py_HashBytes() calls */ |
29 | | #ifdef Py_HASH_STATS |
30 | | #define Py_HASH_STATS_MAX 32 |
31 | | static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0}; |
32 | | #endif |
33 | | |
34 | | /* For numeric types, the hash of a number x is based on the reduction |
35 | | of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that |
36 | | hash(x) == hash(y) whenever x and y are numerically equal, even if |
37 | | x and y have different types. |
38 | | |
39 | | A quick summary of the hashing strategy: |
40 | | |
41 | | (1) First define the 'reduction of x modulo P' for any rational |
42 | | number x; this is a standard extension of the usual notion of |
43 | | reduction modulo P for integers. If x == p/q (written in lowest |
44 | | terms), the reduction is interpreted as the reduction of p times |
45 | | the inverse of the reduction of q, all modulo P; if q is exactly |
46 | | divisible by P then define the reduction to be infinity. So we've |
47 | | got a well-defined map |
48 | | |
49 | | reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }. |
50 | | |
51 | | (2) Now for a rational number x, define hash(x) by: |
52 | | |
53 | | reduce(x) if x >= 0 |
54 | | -reduce(-x) if x < 0 |
55 | | |
56 | | If the result of the reduction is infinity (this is impossible for |
57 | | integers, floats and Decimals) then use the predefined hash value |
58 | | _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead. |
59 | | _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the |
60 | | hashes of float and Decimal infinities and nans. |
61 | | |
62 | | A selling point for the above strategy is that it makes it possible |
63 | | to compute hashes of decimal and binary floating-point numbers |
64 | | efficiently, even if the exponent of the binary or decimal number |
65 | | is large. The key point is that |
66 | | |
67 | | reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS) |
68 | | |
69 | | provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a |
70 | | binary or decimal float is never infinity, since the denominator is a power |
71 | | of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have, |
72 | | for nonnegative x, |
73 | | |
74 | | reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS |
75 | | |
76 | | reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS |
77 | | |
78 | | and reduce(10**e) can be computed efficiently by the usual modular |
79 | | exponentiation algorithm. For reduce(2**e) it's even better: since |
80 | | P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication |
81 | | by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits. |
82 | | |
83 | | */ |
84 | | |
85 | | Py_hash_t |
86 | | _Py_HashDouble(double v) |
87 | 10 | { |
88 | 10 | int e, sign; |
89 | 10 | double m; |
90 | 10 | Py_uhash_t x, y; |
91 | | |
92 | 10 | if (!Py_IS_FINITE(v)) { |
93 | 0 | if (Py_IS_INFINITY(v)) |
94 | 0 | return v > 0 ? _PyHASH_INF : -_PyHASH_INF; |
95 | 0 | else |
96 | 0 | return _PyHASH_NAN; |
97 | 0 | } |
98 | | |
99 | 10 | m = frexp(v, &e); |
100 | | |
101 | 10 | sign = 1; |
102 | 10 | if (m < 0) { |
103 | 0 | sign = -1; |
104 | 0 | m = -m; |
105 | 0 | } |
106 | | |
107 | | /* process 28 bits at a time; this should work well both for binary |
108 | | and hexadecimal floating point. */ |
109 | 10 | x = 0; |
110 | 20 | while (m) { |
111 | 10 | x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28); |
112 | 10 | m *= 268435456.0; /* 2**28 */ |
113 | 10 | e -= 28; |
114 | 10 | y = (Py_uhash_t)m; /* pull out integer part */ |
115 | 10 | m -= y; |
116 | 10 | x += y; |
117 | 10 | if (x >= _PyHASH_MODULUS) |
118 | 0 | x -= _PyHASH_MODULUS; |
119 | 10 | } |
120 | | |
121 | | /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */ |
122 | 10 | e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS); |
123 | 10 | x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e); |
124 | | |
125 | 10 | x = x * sign; |
126 | 10 | if (x == (Py_uhash_t)-1) |
127 | 0 | x = (Py_uhash_t)-2; |
128 | 10 | return (Py_hash_t)x; |
129 | 10 | } |
130 | | |
131 | | Py_hash_t |
132 | | _Py_HashPointer(void *p) |
133 | 2.65k | { |
134 | 2.65k | Py_hash_t x; |
135 | 2.65k | size_t y = (size_t)p; |
136 | | /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid |
137 | | excessive hash collisions for dicts and sets */ |
138 | 2.65k | y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4)); |
139 | 2.65k | x = (Py_hash_t)y; |
140 | 2.65k | if (x == -1) |
141 | 0 | x = -2; |
142 | 2.65k | return x; |
143 | 2.65k | } |
144 | | |
145 | | Py_hash_t |
146 | | _Py_HashBytes(const void *src, Py_ssize_t len) |
147 | 216k | { |
148 | 216k | Py_hash_t x; |
149 | | /* |
150 | | We make the hash of the empty string be 0, rather than using |
151 | | (prefix ^ suffix), since this slightly obfuscates the hash secret |
152 | | */ |
153 | 216k | if (len == 0) { |
154 | 15 | return 0; |
155 | 15 | } |
156 | | |
157 | | #ifdef Py_HASH_STATS |
158 | | hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++; |
159 | | #endif |
160 | | |
161 | | #if Py_HASH_CUTOFF > 0 |
162 | | if (len < Py_HASH_CUTOFF) { |
163 | | /* Optimize hashing of very small strings with inline DJBX33A. */ |
164 | | Py_uhash_t hash; |
165 | | const unsigned char *p = src; |
166 | | hash = 5381; /* DJBX33A starts with 5381 */ |
167 | | |
168 | | switch(len) { |
169 | | /* ((hash << 5) + hash) + *p == hash * 33 + *p */ |
170 | | case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ |
171 | | case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ |
172 | | case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ |
173 | | case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ |
174 | | case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ |
175 | | case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */ |
176 | | case 1: hash = ((hash << 5) + hash) + *p++; break; |
177 | | default: |
178 | | Py_UNREACHABLE(); |
179 | | } |
180 | | hash ^= len; |
181 | | hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix; |
182 | | x = (Py_hash_t)hash; |
183 | | } |
184 | | else |
185 | | #endif /* Py_HASH_CUTOFF */ |
186 | 216k | x = PyHash_Func.hash(src, len); |
187 | | |
188 | 216k | if (x == -1) |
189 | 0 | return -2; |
190 | 216k | return x; |
191 | 216k | } |
192 | | |
193 | | void |
194 | | _PyHash_Fini(void) |
195 | 0 | { |
196 | | #ifdef Py_HASH_STATS |
197 | | int i; |
198 | | Py_ssize_t total = 0; |
199 | | const char *fmt = "%2i %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n"; |
200 | | |
201 | | fprintf(stderr, "len calls total\n"); |
202 | | for (i = 1; i <= Py_HASH_STATS_MAX; i++) { |
203 | | total += hashstats[i]; |
204 | | fprintf(stderr, fmt, i, hashstats[i], total); |
205 | | } |
206 | | total += hashstats[0]; |
207 | | fprintf(stderr, "> %8" PY_FORMAT_SIZE_T "d %8" PY_FORMAT_SIZE_T "d\n", |
208 | | hashstats[0], total); |
209 | | #endif |
210 | 0 | } |
211 | | |
212 | | PyHash_FuncDef * |
213 | | PyHash_GetFuncDef(void) |
214 | 14 | { |
215 | 14 | return &PyHash_Func; |
216 | 14 | } |
217 | | |
218 | | /* Optimized memcpy() for Windows */ |
219 | | #ifdef _MSC_VER |
220 | | # if SIZEOF_PY_UHASH_T == 4 |
221 | | # define PY_UHASH_CPY(dst, src) do { \ |
222 | | dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \ |
223 | | } while(0) |
224 | | # elif SIZEOF_PY_UHASH_T == 8 |
225 | | # define PY_UHASH_CPY(dst, src) do { \ |
226 | | dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \ |
227 | | dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \ |
228 | | } while(0) |
229 | | # else |
230 | | # error SIZEOF_PY_UHASH_T must be 4 or 8 |
231 | | # endif /* SIZEOF_PY_UHASH_T */ |
232 | | #else /* not Windows */ |
233 | | # define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T) |
234 | | #endif /* _MSC_VER */ |
235 | | |
236 | | |
237 | | #if Py_HASH_ALGORITHM == Py_HASH_FNV |
238 | | /* ************************************************************************** |
239 | | * Modified Fowler-Noll-Vo (FNV) hash function |
240 | | */ |
241 | | static Py_hash_t |
242 | | fnv(const void *src, Py_ssize_t len) |
243 | | { |
244 | | const unsigned char *p = src; |
245 | | Py_uhash_t x; |
246 | | Py_ssize_t remainder, blocks; |
247 | | union { |
248 | | Py_uhash_t value; |
249 | | unsigned char bytes[SIZEOF_PY_UHASH_T]; |
250 | | } block; |
251 | | |
252 | | #ifdef Py_DEBUG |
253 | | assert(_Py_HashSecret_Initialized); |
254 | | #endif |
255 | | remainder = len % SIZEOF_PY_UHASH_T; |
256 | | if (remainder == 0) { |
257 | | /* Process at least one block byte by byte to reduce hash collisions |
258 | | * for strings with common prefixes. */ |
259 | | remainder = SIZEOF_PY_UHASH_T; |
260 | | } |
261 | | blocks = (len - remainder) / SIZEOF_PY_UHASH_T; |
262 | | |
263 | | x = (Py_uhash_t) _Py_HashSecret.fnv.prefix; |
264 | | x ^= (Py_uhash_t) *p << 7; |
265 | | while (blocks--) { |
266 | | PY_UHASH_CPY(block.bytes, p); |
267 | | x = (_PyHASH_MULTIPLIER * x) ^ block.value; |
268 | | p += SIZEOF_PY_UHASH_T; |
269 | | } |
270 | | /* add remainder */ |
271 | | for (; remainder > 0; remainder--) |
272 | | x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++; |
273 | | x ^= (Py_uhash_t) len; |
274 | | x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix; |
275 | | if (x == (Py_uhash_t) -1) { |
276 | | x = (Py_uhash_t) -2; |
277 | | } |
278 | | return x; |
279 | | } |
280 | | |
281 | | static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T, |
282 | | 16 * SIZEOF_PY_HASH_T}; |
283 | | |
284 | | #endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */ |
285 | | |
286 | | |
287 | | /* ************************************************************************** |
288 | | <MIT License> |
289 | | Copyright (c) 2013 Marek Majkowski <marek@popcount.org> |
290 | | |
291 | | Permission is hereby granted, free of charge, to any person obtaining a copy |
292 | | of this software and associated documentation files (the "Software"), to deal |
293 | | in the Software without restriction, including without limitation the rights |
294 | | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
295 | | copies of the Software, and to permit persons to whom the Software is |
296 | | furnished to do so, subject to the following conditions: |
297 | | |
298 | | The above copyright notice and this permission notice shall be included in |
299 | | all copies or substantial portions of the Software. |
300 | | |
301 | | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
302 | | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
303 | | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
304 | | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
305 | | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
306 | | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
307 | | THE SOFTWARE. |
308 | | </MIT License> |
309 | | |
310 | | Original location: |
311 | | https://github.com/majek/csiphash/ |
312 | | |
313 | | Solution inspired by code from: |
314 | | Samuel Neves (supercop/crypto_auth/siphash24/little) |
315 | | djb (supercop/crypto_auth/siphash24/little2) |
316 | | Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) |
317 | | |
318 | | Modified for Python by Christian Heimes: |
319 | | - C89 / MSVC compatibility |
320 | | - _rotl64() on Windows |
321 | | - letoh64() fallback |
322 | | */ |
323 | | |
324 | | /* byte swap little endian to host endian |
325 | | * Endian conversion not only ensures that the hash function returns the same |
326 | | * value on all platforms. It is also required to for a good dispersion of |
327 | | * the hash values' least significant bits. |
328 | | */ |
329 | | #if PY_LITTLE_ENDIAN |
330 | 1.22M | # define _le64toh(x) ((uint64_t)(x)) |
331 | | #elif defined(__APPLE__) |
332 | | # define _le64toh(x) OSSwapLittleToHostInt64(x) |
333 | | #elif defined(HAVE_LETOH64) |
334 | | # define _le64toh(x) le64toh(x) |
335 | | #else |
336 | | # define _le64toh(x) (((uint64_t)(x) << 56) | \ |
337 | | (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \ |
338 | | (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \ |
339 | | (((uint64_t)(x) << 8) & 0xff00000000ULL) | \ |
340 | | (((uint64_t)(x) >> 8) & 0xff000000ULL) | \ |
341 | | (((uint64_t)(x) >> 24) & 0xff0000ULL) | \ |
342 | | (((uint64_t)(x) >> 40) & 0xff00ULL) | \ |
343 | | ((uint64_t)(x) >> 56)) |
344 | | #endif |
345 | | |
346 | | |
347 | | #ifdef _MSC_VER |
348 | | # define ROTATE(x, b) _rotl64(x, b) |
349 | | #else |
350 | 14.6M | # define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) ) |
351 | | #endif |
352 | | |
353 | | #define HALF_ROUND(a,b,c,d,s,t) \ |
354 | 4.89M | a += b; c += d; \ |
355 | 4.89M | b = ROTATE(b, s) ^ a; \ |
356 | 4.89M | d = ROTATE(d, t) ^ c; \ |
357 | 4.89M | a = ROTATE(a, 32); |
358 | | |
359 | | #define DOUBLE_ROUND(v0,v1,v2,v3) \ |
360 | 1.22M | HALF_ROUND(v0,v1,v2,v3,13,16); \ |
361 | 1.22M | HALF_ROUND(v2,v1,v0,v3,17,21); \ |
362 | 1.22M | HALF_ROUND(v0,v1,v2,v3,13,16); \ |
363 | 1.22M | HALF_ROUND(v2,v1,v0,v3,17,21); |
364 | | |
365 | | |
366 | | static uint64_t |
367 | 216k | siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) { |
368 | 216k | uint64_t b = (uint64_t)src_sz << 56; |
369 | 216k | const uint8_t *in = (const uint8_t*)src; |
370 | | |
371 | 216k | uint64_t v0 = k0 ^ 0x736f6d6570736575ULL; |
372 | 216k | uint64_t v1 = k1 ^ 0x646f72616e646f6dULL; |
373 | 216k | uint64_t v2 = k0 ^ 0x6c7967656e657261ULL; |
374 | 216k | uint64_t v3 = k1 ^ 0x7465646279746573ULL; |
375 | | |
376 | 216k | uint64_t t; |
377 | 216k | uint8_t *pt; |
378 | | |
379 | 791k | while (src_sz >= 8) { |
380 | 574k | uint64_t mi; |
381 | 574k | memcpy(&mi, in, sizeof(mi)); |
382 | 574k | mi = _le64toh(mi); |
383 | 574k | in += sizeof(mi); |
384 | 574k | src_sz -= sizeof(mi); |
385 | 574k | v3 ^= mi; |
386 | 574k | DOUBLE_ROUND(v0,v1,v2,v3); |
387 | 574k | v0 ^= mi; |
388 | 574k | } |
389 | | |
390 | 216k | t = 0; |
391 | 216k | pt = (uint8_t *)&t; |
392 | 216k | switch (src_sz) { |
393 | 27.8k | case 7: pt[6] = in[6]; /* fall through */ |
394 | 52.3k | case 6: pt[5] = in[5]; /* fall through */ |
395 | 75.5k | case 5: pt[4] = in[4]; /* fall through */ |
396 | 100k | case 4: memcpy(pt, in, sizeof(uint32_t)); break; |
397 | 19.8k | case 3: pt[2] = in[2]; /* fall through */ |
398 | 43.5k | case 2: pt[1] = in[1]; /* fall through */ |
399 | 72.8k | case 1: pt[0] = in[0]; /* fall through */ |
400 | 216k | } |
401 | 216k | b |= _le64toh(t); |
402 | | |
403 | 216k | v3 ^= b; |
404 | 216k | DOUBLE_ROUND(v0,v1,v2,v3); |
405 | 216k | v0 ^= b; |
406 | 216k | v2 ^= 0xff; |
407 | 216k | DOUBLE_ROUND(v0,v1,v2,v3); |
408 | 216k | DOUBLE_ROUND(v0,v1,v2,v3); |
409 | | |
410 | | /* modified */ |
411 | 216k | t = (v0 ^ v1) ^ (v2 ^ v3); |
412 | 216k | return t; |
413 | 216k | } |
414 | | |
415 | | uint64_t |
416 | | _Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz) |
417 | 0 | { |
418 | 0 | return siphash24(key, 0, src, src_sz); |
419 | 0 | } |
420 | | |
421 | | |
422 | | #if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 |
423 | | static Py_hash_t |
424 | 216k | pysiphash(const void *src, Py_ssize_t src_sz) { |
425 | 216k | return (Py_hash_t)siphash24( |
426 | 216k | _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1), |
427 | 216k | src, src_sz); |
428 | 216k | } |
429 | | |
430 | | static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128}; |
431 | | #endif |
432 | | |
433 | | #ifdef __cplusplus |
434 | | } |
435 | | #endif |