Coverage Report

Created: 2025-07-04 06:49

/src/cpython/Python/pyhash.c
Line
Count
Source (jump to first uncovered line)
1
/* Set of hash utility functions to help maintaining the invariant that
2
    if a==b then hash(a)==hash(b)
3
4
   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5
*/
6
#include "Python.h"
7
#include "pycore_pyhash.h"        // _Py_HashSecret_t
8
9
#ifdef __APPLE__
10
#  include <libkern/OSByteOrder.h>
11
#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
12
#  include <endian.h>
13
#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
14
#  include <sys/endian.h>
15
#endif
16
17
_Py_HashSecret_t _Py_HashSecret = {{0}};
18
19
#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
20
extern PyHash_FuncDef PyHash_Func;
21
#else
22
static PyHash_FuncDef PyHash_Func;
23
#endif
24
25
/* Count Py_HashBuffer() calls */
26
#ifdef Py_HASH_STATS
27
#define Py_HASH_STATS_MAX 32
28
static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
29
#endif
30
31
/* For numeric types, the hash of a number x is based on the reduction
32
   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
33
   hash(x) == hash(y) whenever x and y are numerically equal, even if
34
   x and y have different types.
35
36
   A quick summary of the hashing strategy:
37
38
   (1) First define the 'reduction of x modulo P' for any rational
39
   number x; this is a standard extension of the usual notion of
40
   reduction modulo P for integers.  If x == p/q (written in lowest
41
   terms), the reduction is interpreted as the reduction of p times
42
   the inverse of the reduction of q, all modulo P; if q is exactly
43
   divisible by P then define the reduction to be infinity.  So we've
44
   got a well-defined map
45
46
      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
47
48
   (2) Now for a rational number x, define hash(x) by:
49
50
      reduce(x)   if x >= 0
51
      -reduce(-x) if x < 0
52
53
   If the result of the reduction is infinity (this is impossible for
54
   integers, floats and Decimals) then use the predefined hash value
55
   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
56
   _PyHASH_INF and -_PyHASH_INF are also used for the
57
   hashes of float and Decimal infinities.
58
59
   NaNs hash with a pointer hash.  Having distinct hash values prevents
60
   catastrophic pileups from distinct NaN instances which used to always
61
   have the same hash value but would compare unequal.
62
63
   A selling point for the above strategy is that it makes it possible
64
   to compute hashes of decimal and binary floating-point numbers
65
   efficiently, even if the exponent of the binary or decimal number
66
   is large.  The key point is that
67
68
      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
69
70
   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
71
   binary or decimal float is never infinity, since the denominator is a power
72
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
73
   for nonnegative x,
74
75
      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
76
77
      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
78
79
   and reduce(10**e) can be computed efficiently by the usual modular
80
   exponentiation algorithm.  For reduce(2**e) it's even better: since
81
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
82
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
83
84
   */
85
86
Py_hash_t
87
_Py_HashDouble(PyObject *inst, double v)
88
365
{
89
365
    int e, sign;
90
365
    double m;
91
365
    Py_uhash_t x, y;
92
93
365
    if (!isfinite(v)) {
94
0
        if (isinf(v))
95
0
            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
96
0
        else
97
0
            return PyObject_GenericHash(inst);
98
0
    }
99
100
365
    m = frexp(v, &e);
101
102
365
    sign = 1;
103
365
    if (m < 0) {
104
12
        sign = -1;
105
12
        m = -m;
106
12
    }
107
108
    /* process 28 bits at a time;  this should work well both for binary
109
       and hexadecimal floating point. */
110
365
    x = 0;
111
745
    while (m) {
112
380
        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
113
380
        m *= 268435456.0;  /* 2**28 */
114
380
        e -= 28;
115
380
        y = (Py_uhash_t)m;  /* pull out integer part */
116
380
        m -= y;
117
380
        x += y;
118
380
        if (x >= _PyHASH_MODULUS)
119
0
            x -= _PyHASH_MODULUS;
120
380
    }
121
122
    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
123
365
    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
124
365
    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
125
126
365
    x = x * sign;
127
365
    if (x == (Py_uhash_t)-1)
128
3
        x = (Py_uhash_t)-2;
129
365
    return (Py_hash_t)x;
130
365
}
131
132
Py_hash_t
133
Py_HashPointer(const void *ptr)
134
940k
{
135
940k
    Py_hash_t hash = _Py_HashPointerRaw(ptr);
136
940k
    if (hash == -1) {
137
0
        hash = -2;
138
0
    }
139
940k
    return hash;
140
940k
}
141
142
Py_hash_t
143
PyObject_GenericHash(PyObject *obj)
144
938k
{
145
938k
    return Py_HashPointer(obj);
146
938k
}
147
148
Py_hash_t
149
Py_HashBuffer(const void *ptr, Py_ssize_t len)
150
51.8M
{
151
    /*
152
      We make the hash of the empty string be 0, rather than using
153
      (prefix ^ suffix), since this slightly obfuscates the hash secret
154
    */
155
51.8M
    if (len == 0) {
156
32
        return 0;
157
32
    }
158
159
#ifdef Py_HASH_STATS
160
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
161
#endif
162
163
51.8M
    Py_hash_t x;
164
#if Py_HASH_CUTOFF > 0
165
    if (len < Py_HASH_CUTOFF) {
166
        /* Optimize hashing of very small strings with inline DJBX33A. */
167
        Py_uhash_t hash;
168
        const unsigned char *p = ptr;
169
        hash = 5381; /* DJBX33A starts with 5381 */
170
171
        switch(len) {
172
            /* ((hash << 5) + hash) + *p == hash * 33 + *p */
173
            case 7: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
174
            case 6: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
175
            case 5: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
176
            case 4: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
177
            case 3: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
178
            case 2: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
179
            case 1: hash = ((hash << 5) + hash) + *p++; break;
180
            default:
181
                Py_UNREACHABLE();
182
        }
183
        hash ^= len;
184
        hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
185
        x = (Py_hash_t)hash;
186
    }
187
    else
188
#endif /* Py_HASH_CUTOFF */
189
51.8M
    {
190
51.8M
        x = PyHash_Func.hash(ptr, len);
191
51.8M
    }
192
193
51.8M
    if (x == -1) {
194
0
        return -2;
195
0
    }
196
51.8M
    return x;
197
51.8M
}
198
199
void
200
_PyHash_Fini(void)
201
0
{
202
#ifdef Py_HASH_STATS
203
    fprintf(stderr, "len   calls    total\n");
204
    Py_ssize_t total = 0;
205
    for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
206
        total += hashstats[i];
207
        fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
208
    }
209
    total += hashstats[0];
210
    fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
211
#endif
212
0
}
213
214
PyHash_FuncDef *
215
PyHash_GetFuncDef(void)
216
16
{
217
16
    return &PyHash_Func;
218
16
}
219
220
/* Optimized memcpy() for Windows */
221
#ifdef _MSC_VER
222
#  if SIZEOF_PY_UHASH_T == 4
223
#    define PY_UHASH_CPY(dst, src) do {                                    \
224
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
225
       } while(0)
226
#  elif SIZEOF_PY_UHASH_T == 8
227
#    define PY_UHASH_CPY(dst, src) do {                                    \
228
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
229
       dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
230
       } while(0)
231
#  else
232
#    error SIZEOF_PY_UHASH_T must be 4 or 8
233
#  endif /* SIZEOF_PY_UHASH_T */
234
#else /* not Windows */
235
#  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
236
#endif /* _MSC_VER */
237
238
239
#if Py_HASH_ALGORITHM == Py_HASH_FNV
240
/* **************************************************************************
241
 * Modified Fowler-Noll-Vo (FNV) hash function
242
 */
243
static Py_hash_t
244
fnv(const void *src, Py_ssize_t len)
245
{
246
    const unsigned char *p = src;
247
    Py_uhash_t x;
248
    Py_ssize_t remainder, blocks;
249
    union {
250
        Py_uhash_t value;
251
        unsigned char bytes[SIZEOF_PY_UHASH_T];
252
    } block;
253
254
#ifdef Py_DEBUG
255
    assert(_Py_HashSecret_Initialized);
256
#endif
257
    remainder = len % SIZEOF_PY_UHASH_T;
258
    if (remainder == 0) {
259
        /* Process at least one block byte by byte to reduce hash collisions
260
         * for strings with common prefixes. */
261
        remainder = SIZEOF_PY_UHASH_T;
262
    }
263
    blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
264
265
    x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
266
    x ^= (Py_uhash_t) *p << 7;
267
    while (blocks--) {
268
        PY_UHASH_CPY(block.bytes, p);
269
        x = (PyHASH_MULTIPLIER * x) ^ block.value;
270
        p += SIZEOF_PY_UHASH_T;
271
    }
272
    /* add remainder */
273
    for (; remainder > 0; remainder--)
274
        x = (PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
275
    x ^= (Py_uhash_t) len;
276
    x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
277
    if (x == (Py_uhash_t) -1) {
278
        x = (Py_uhash_t) -2;
279
    }
280
    return x;
281
}
282
283
static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
284
                                     16 * SIZEOF_PY_HASH_T};
285
286
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
287
288
289
/* **************************************************************************
290
 <MIT License>
291
 Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
292
293
 Permission is hereby granted, free of charge, to any person obtaining a copy
294
 of this software and associated documentation files (the "Software"), to deal
295
 in the Software without restriction, including without limitation the rights
296
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
297
 copies of the Software, and to permit persons to whom the Software is
298
 furnished to do so, subject to the following conditions:
299
300
 The above copyright notice and this permission notice shall be included in
301
 all copies or substantial portions of the Software.
302
303
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
304
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
305
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
306
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
307
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
308
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
309
 THE SOFTWARE.
310
 </MIT License>
311
312
 Original location:
313
    https://github.com/majek/csiphash/
314
315
 Solution inspired by code from:
316
    Samuel Neves (supercop/crypto_auth/siphash24/little)
317
    djb (supercop/crypto_auth/siphash24/little2)
318
    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
319
320
 Modified for Python by Christian Heimes:
321
    - C89 / MSVC compatibility
322
    - _rotl64() on Windows
323
    - letoh64() fallback
324
*/
325
326
/* byte swap little endian to host endian
327
 * Endian conversion not only ensures that the hash function returns the same
328
 * value on all platforms. It is also required to for a good dispersion of
329
 * the hash values' least significant bits.
330
 */
331
#if PY_LITTLE_ENDIAN
332
322M
#  define _le64toh(x) ((uint64_t)(x))
333
#elif defined(__APPLE__)
334
#  define _le64toh(x) OSSwapLittleToHostInt64(x)
335
#elif defined(HAVE_LETOH64)
336
#  define _le64toh(x) le64toh(x)
337
#else
338
#  define _le64toh(x) (((uint64_t)(x) << 56) | \
339
                      (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
340
                      (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
341
                      (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
342
                      (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
343
                      (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
344
                      (((uint64_t)(x) >> 40) & 0xff00ULL) | \
345
                      ((uint64_t)(x)  >> 56))
346
#endif
347
348
349
#ifdef _MSC_VER
350
#  define ROTATE(x, b)  _rotl64(x, b)
351
#else
352
2.24G
#  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
353
#endif
354
355
#define HALF_ROUND(a,b,c,d,s,t)     \
356
748M
    a += b; c += d;                 \
357
748M
    b = ROTATE(b, s) ^ a;           \
358
748M
    d = ROTATE(d, t) ^ c;           \
359
748M
    a = ROTATE(a, 32);
360
361
#define SINGLE_ROUND(v0,v1,v2,v3)   \
362
374M
    HALF_ROUND(v0,v1,v2,v3,13,16);  \
363
374M
    HALF_ROUND(v2,v1,v0,v3,17,21);
364
365
#define DOUBLE_ROUND(v0,v1,v2,v3)   \
366
    SINGLE_ROUND(v0,v1,v2,v3);      \
367
    SINGLE_ROUND(v0,v1,v2,v3);
368
369
370
static uint64_t
371
51.8M
siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
372
51.8M
    uint64_t b = (uint64_t)src_sz << 56;
373
51.8M
    const uint8_t *in = (const uint8_t*)src;
374
375
51.8M
    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
376
51.8M
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
377
51.8M
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
378
51.8M
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
379
380
51.8M
    uint64_t t;
381
51.8M
    uint8_t *pt;
382
383
218M
    while (src_sz >= 8) {
384
167M
        uint64_t mi;
385
167M
        memcpy(&mi, in, sizeof(mi));
386
167M
        mi = _le64toh(mi);
387
167M
        in += sizeof(mi);
388
167M
        src_sz -= sizeof(mi);
389
167M
        v3 ^= mi;
390
167M
        SINGLE_ROUND(v0,v1,v2,v3);
391
167M
        v0 ^= mi;
392
167M
    }
393
394
51.8M
    t = 0;
395
51.8M
    pt = (uint8_t *)&t;
396
51.8M
    switch (src_sz) {
397
255k
        case 7: pt[6] = in[6]; _Py_FALLTHROUGH;
398
3.14M
        case 6: pt[5] = in[5]; _Py_FALLTHROUGH;
399
3.63M
        case 5: pt[4] = in[4]; _Py_FALLTHROUGH;
400
6.52M
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
401
2.02M
        case 3: pt[2] = in[2]; _Py_FALLTHROUGH;
402
41.3M
        case 2: pt[1] = in[1]; _Py_FALLTHROUGH;
403
42.7M
        case 1: pt[0] = in[0]; break;
404
51.8M
    }
405
51.8M
    b |= _le64toh(t);
406
407
51.8M
    v3 ^= b;
408
51.8M
    SINGLE_ROUND(v0,v1,v2,v3);
409
51.8M
    v0 ^= b;
410
51.8M
    v2 ^= 0xff;
411
51.8M
    SINGLE_ROUND(v0,v1,v2,v3);
412
51.8M
    SINGLE_ROUND(v0,v1,v2,v3);
413
51.8M
    SINGLE_ROUND(v0,v1,v2,v3);
414
415
    /* modified */
416
51.8M
    t = (v0 ^ v1) ^ (v2 ^ v3);
417
51.8M
    return t;
418
51.8M
}
419
420
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
421
static uint64_t
422
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
423
    uint64_t b = (uint64_t)src_sz << 56;
424
    const uint8_t *in = (const uint8_t*)src;
425
426
    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
427
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
428
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
429
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
430
431
    uint64_t t;
432
    uint8_t *pt;
433
434
    while (src_sz >= 8) {
435
        uint64_t mi;
436
        memcpy(&mi, in, sizeof(mi));
437
        mi = _le64toh(mi);
438
        in += sizeof(mi);
439
        src_sz -= sizeof(mi);
440
        v3 ^= mi;
441
        DOUBLE_ROUND(v0,v1,v2,v3);
442
        v0 ^= mi;
443
    }
444
445
    t = 0;
446
    pt = (uint8_t *)&t;
447
    switch (src_sz) {
448
        case 7: pt[6] = in[6]; _Py_FALLTHROUGH;
449
        case 6: pt[5] = in[5]; _Py_FALLTHROUGH;
450
        case 5: pt[4] = in[4]; _Py_FALLTHROUGH;
451
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
452
        case 3: pt[2] = in[2]; _Py_FALLTHROUGH;
453
        case 2: pt[1] = in[1]; _Py_FALLTHROUGH;
454
        case 1: pt[0] = in[0]; break;
455
    }
456
    b |= _le64toh(t);
457
458
    v3 ^= b;
459
    DOUBLE_ROUND(v0,v1,v2,v3);
460
    v0 ^= b;
461
    v2 ^= 0xff;
462
    DOUBLE_ROUND(v0,v1,v2,v3);
463
    DOUBLE_ROUND(v0,v1,v2,v3);
464
465
    /* modified */
466
    t = (v0 ^ v1) ^ (v2 ^ v3);
467
    return t;
468
}
469
#endif
470
471
uint64_t
472
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
473
0
{
474
0
    return siphash13(key, 0, src, src_sz);
475
0
}
476
477
478
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
479
static Py_hash_t
480
51.8M
pysiphash(const void *src, Py_ssize_t src_sz) {
481
51.8M
    return (Py_hash_t)siphash13(
482
51.8M
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
483
51.8M
        src, src_sz);
484
51.8M
}
485
486
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
487
#endif
488
489
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
490
static Py_hash_t
491
pysiphash(const void *src, Py_ssize_t src_sz) {
492
    return (Py_hash_t)siphash24(
493
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
494
        src, src_sz);
495
}
496
497
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
498
#endif