Coverage Report

Created: 2026-03-08 06:40

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/cpython/Python/pyhash.c
Line
Count
Source
1
/* Set of hash utility functions to help maintaining the invariant that
2
    if a==b then hash(a)==hash(b)
3
4
   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
5
*/
6
#include "Python.h"
7
#include "pycore_pyhash.h"        // _Py_HashSecret_t
8
9
#ifdef __APPLE__
10
#  include <libkern/OSByteOrder.h>
11
#elif defined(HAVE_LE64TOH) && defined(HAVE_ENDIAN_H)
12
#  include <endian.h>
13
#elif defined(HAVE_LE64TOH) && defined(HAVE_SYS_ENDIAN_H)
14
#  include <sys/endian.h>
15
#endif
16
17
_Py_HashSecret_t _Py_HashSecret = {{0}};
18
19
#if Py_HASH_ALGORITHM == Py_HASH_EXTERNAL
20
Py_DEPRECATED(3.15) extern PyHash_FuncDef PyHash_Func;
21
#else
22
static PyHash_FuncDef PyHash_Func;
23
#endif
24
25
/* Count Py_HashBuffer() calls */
26
#ifdef Py_HASH_STATS
27
#define Py_HASH_STATS_MAX 32
28
static Py_ssize_t hashstats[Py_HASH_STATS_MAX + 1] = {0};
29
#endif
30
31
/* For numeric types, the hash of a number x is based on the reduction
32
   of x modulo the prime P = 2**PyHASH_BITS - 1.  It's designed so that
33
   hash(x) == hash(y) whenever x and y are numerically equal, even if
34
   x and y have different types.
35
36
   A quick summary of the hashing strategy:
37
38
   (1) First define the 'reduction of x modulo P' for any rational
39
   number x; this is a standard extension of the usual notion of
40
   reduction modulo P for integers.  If x == p/q (written in lowest
41
   terms), the reduction is interpreted as the reduction of p times
42
   the inverse of the reduction of q, all modulo P; if q is exactly
43
   divisible by P then define the reduction to be infinity.  So we've
44
   got a well-defined map
45
46
      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
47
48
   (2) Now for a rational number x, define hash(x) by:
49
50
      reduce(x)   if x >= 0
51
      -reduce(-x) if x < 0
52
53
   If the result of the reduction is infinity (this is impossible for
54
   integers, floats and Decimals) then use the predefined hash value
55
   PyHASH_INF for x >= 0, or -PyHASH_INF for x < 0, instead.
56
   PyHASH_INF and -PyHASH_INF are also used for the
57
   hashes of float and Decimal infinities.
58
59
   NaNs hash with a pointer hash.  Having distinct hash values prevents
60
   catastrophic pileups from distinct NaN instances which used to always
61
   have the same hash value but would compare unequal.
62
63
   A selling point for the above strategy is that it makes it possible
64
   to compute hashes of decimal and binary floating-point numbers
65
   efficiently, even if the exponent of the binary or decimal number
66
   is large.  The key point is that
67
68
      reduce(x * y) == reduce(x) * reduce(y) (modulo PyHASH_MODULUS)
69
70
   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
71
   binary or decimal float is never infinity, since the denominator is a power
72
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
73
   for nonnegative x,
74
75
      reduce(x * 2**e) == reduce(x) * reduce(2**e) % PyHASH_MODULUS
76
77
      reduce(x * 10**e) == reduce(x) * reduce(10**e) % PyHASH_MODULUS
78
79
   and reduce(10**e) can be computed efficiently by the usual modular
80
   exponentiation algorithm.  For reduce(2**e) it's even better: since
81
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
82
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
83
84
   */
85
86
Py_hash_t
87
_Py_HashDouble(PyObject *inst, double v)
88
2.27M
{
89
2.27M
    int e, sign;
90
2.27M
    double m;
91
2.27M
    Py_uhash_t x, y;
92
93
2.27M
    if (!isfinite(v)) {
94
0
        if (isinf(v))
95
0
            return v > 0 ? PyHASH_INF : -PyHASH_INF;
96
0
        else
97
0
            return PyObject_GenericHash(inst);
98
0
    }
99
100
2.27M
    m = frexp(v, &e);
101
102
2.27M
    sign = 1;
103
2.27M
    if (m < 0) {
104
124
        sign = -1;
105
124
        m = -m;
106
124
    }
107
108
    /* process 28 bits at a time;  this should work well both for binary
109
       and hexadecimal floating point. */
110
2.27M
    x = 0;
111
6.76M
    while (m) {
112
4.48M
        x = ((x << 28) & PyHASH_MODULUS) | x >> (PyHASH_BITS - 28);
113
4.48M
        m *= 268435456.0;  /* 2**28 */
114
4.48M
        e -= 28;
115
4.48M
        y = (Py_uhash_t)m;  /* pull out integer part */
116
4.48M
        m -= y;
117
4.48M
        x += y;
118
4.48M
        if (x >= PyHASH_MODULUS)
119
0
            x -= PyHASH_MODULUS;
120
4.48M
    }
121
122
    /* adjust for the exponent;  first reduce it modulo PyHASH_BITS */
123
2.27M
    e = e >= 0 ? e % PyHASH_BITS : PyHASH_BITS-1-((-1-e) % PyHASH_BITS);
124
2.27M
    x = ((x << e) & PyHASH_MODULUS) | x >> (PyHASH_BITS - e);
125
126
2.27M
    x = x * sign;
127
2.27M
    if (x == (Py_uhash_t)-1)
128
0
        x = (Py_uhash_t)-2;
129
2.27M
    return (Py_hash_t)x;
130
2.27M
}
131
132
Py_hash_t
133
Py_HashPointer(const void *ptr)
134
20.8M
{
135
20.8M
    Py_hash_t hash = _Py_HashPointerRaw(ptr);
136
20.8M
    if (hash == -1) {
137
0
        hash = -2;
138
0
    }
139
20.8M
    return hash;
140
20.8M
}
141
142
Py_hash_t
143
PyObject_GenericHash(PyObject *obj)
144
20.7M
{
145
20.7M
    return Py_HashPointer(obj);
146
20.7M
}
147
148
Py_hash_t
149
Py_HashBuffer(const void *ptr, Py_ssize_t len)
150
108M
{
151
    /*
152
      We make the hash of the empty string be 0, rather than using
153
      (prefix ^ suffix), since this slightly obfuscates the hash secret
154
    */
155
108M
    if (len == 0) {
156
68
        return 0;
157
68
    }
158
159
#ifdef Py_HASH_STATS
160
    hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
161
#endif
162
163
108M
    Py_hash_t x;
164
#if Py_HASH_CUTOFF > 0
165
    if (len < Py_HASH_CUTOFF) {
166
        /* Optimize hashing of very small strings with inline DJBX33A. */
167
        Py_uhash_t hash;
168
        const unsigned char *p = ptr;
169
        hash = 5381; /* DJBX33A starts with 5381 */
170
171
        switch(len) {
172
            /* ((hash << 5) + hash) + *p == hash * 33 + *p */
173
            case 7: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
174
            case 6: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
175
            case 5: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
176
            case 4: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
177
            case 3: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
178
            case 2: hash = ((hash << 5) + hash) + *p++; _Py_FALLTHROUGH;
179
            case 1: hash = ((hash << 5) + hash) + *p++; break;
180
            default:
181
                Py_UNREACHABLE();
182
        }
183
        hash ^= len;
184
        hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
185
        x = (Py_hash_t)hash;
186
    }
187
    else
188
#endif /* Py_HASH_CUTOFF */
189
108M
    {
190
108M
        x = PyHash_Func.hash(ptr, len);
191
108M
    }
192
193
108M
    if (x == -1) {
194
0
        return -2;
195
0
    }
196
108M
    return x;
197
108M
}
198
199
void
200
_PyHash_Fini(void)
201
0
{
202
#ifdef Py_HASH_STATS
203
    fprintf(stderr, "len   calls    total\n");
204
    Py_ssize_t total = 0;
205
    for (int i = 1; i <= Py_HASH_STATS_MAX; i++) {
206
        total += hashstats[i];
207
        fprintf(stderr, "%2i %8zd %8zd\n", i, hashstats[i], total);
208
    }
209
    total += hashstats[0];
210
    fprintf(stderr, ">  %8zd %8zd\n", hashstats[0], total);
211
#endif
212
0
}
213
214
PyHash_FuncDef *
215
PyHash_GetFuncDef(void)
216
34
{
217
34
    return &PyHash_Func;
218
34
}
219
220
/* Optimized memcpy() for Windows */
221
#ifdef _MSC_VER
222
#  if SIZEOF_PY_UHASH_T == 4
223
#    define PY_UHASH_CPY(dst, src) do {                                    \
224
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
225
       } while(0)
226
#  elif SIZEOF_PY_UHASH_T == 8
227
#    define PY_UHASH_CPY(dst, src) do {                                    \
228
       dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; \
229
       dst[4] = src[4]; dst[5] = src[5]; dst[6] = src[6]; dst[7] = src[7]; \
230
       } while(0)
231
#  else
232
#    error SIZEOF_PY_UHASH_T must be 4 or 8
233
#  endif /* SIZEOF_PY_UHASH_T */
234
#else /* not Windows */
235
#  define PY_UHASH_CPY(dst, src) memcpy(dst, src, SIZEOF_PY_UHASH_T)
236
#endif /* _MSC_VER */
237
238
239
#if Py_HASH_ALGORITHM == Py_HASH_FNV
240
/* **************************************************************************
241
 * Modified Fowler-Noll-Vo (FNV) hash function
242
 */
243
static Py_hash_t
244
fnv(const void *src, Py_ssize_t len)
245
{
246
    const unsigned char *p = src;
247
    Py_uhash_t x;
248
    Py_ssize_t remainder, blocks;
249
    union {
250
        Py_uhash_t value;
251
        unsigned char bytes[SIZEOF_PY_UHASH_T];
252
    } block;
253
254
#ifdef Py_DEBUG
255
    assert(_Py_HashSecret_Initialized);
256
#endif
257
    remainder = len % SIZEOF_PY_UHASH_T;
258
    if (remainder == 0) {
259
        /* Process at least one block byte by byte to reduce hash collisions
260
         * for strings with common prefixes. */
261
        remainder = SIZEOF_PY_UHASH_T;
262
    }
263
    blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
264
265
    x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
266
    x ^= (Py_uhash_t) *p << 7;
267
    while (blocks--) {
268
        PY_UHASH_CPY(block.bytes, p);
269
        x = (PyHASH_MULTIPLIER * x) ^ block.value;
270
        p += SIZEOF_PY_UHASH_T;
271
    }
272
    /* add remainder */
273
    for (; remainder > 0; remainder--)
274
        x = (PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
275
    x ^= (Py_uhash_t) len;
276
    x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
277
    if (x == (Py_uhash_t) -1) {
278
        x = (Py_uhash_t) -2;
279
    }
280
    return x;
281
}
282
283
static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
284
                                     16 * SIZEOF_PY_HASH_T};
285
286
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
287
288
289
/* **************************************************************************
290
 <MIT License>
291
 Copyright (c) 2013  Marek Majkowski <marek@popcount.org>
292
293
 Permission is hereby granted, free of charge, to any person obtaining a copy
294
 of this software and associated documentation files (the "Software"), to deal
295
 in the Software without restriction, including without limitation the rights
296
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
297
 copies of the Software, and to permit persons to whom the Software is
298
 furnished to do so, subject to the following conditions:
299
300
 The above copyright notice and this permission notice shall be included in
301
 all copies or substantial portions of the Software.
302
303
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
304
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
305
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
306
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
307
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
308
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
309
 THE SOFTWARE.
310
 </MIT License>
311
312
 Original location:
313
    https://github.com/majek/csiphash/
314
315
 Solution inspired by code from:
316
    Samuel Neves (supercop/crypto_auth/siphash24/little)
317
    djb (supercop/crypto_auth/siphash24/little2)
318
    Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c)
319
320
 Modified for Python by Christian Heimes:
321
    - C89 / MSVC compatibility
322
    - _rotl64() on Windows
323
    - letoh64() fallback
324
*/
325
326
/* byte swap little endian to host endian
327
 * Endian conversion not only ensures that the hash function returns the same
328
 * value on all platforms. It is also required to for a good dispersion of
329
 * the hash values' least significant bits.
330
 */
331
#if PY_LITTLE_ENDIAN
332
1.99G
#  define _le64toh(x) ((uint64_t)(x))
333
#elif defined(__APPLE__)
334
#  define _le64toh(x) OSSwapLittleToHostInt64(x)
335
#elif defined(HAVE_LETOH64)
336
#  define _le64toh(x) le64toh(x)
337
#else
338
#  define _le64toh(x) (((uint64_t)(x) << 56) | \
339
                      (((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
340
                      (((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
341
                      (((uint64_t)(x) << 8)  & 0xff00000000ULL) | \
342
                      (((uint64_t)(x) >> 8)  & 0xff000000ULL) | \
343
                      (((uint64_t)(x) >> 24) & 0xff0000ULL) | \
344
                      (((uint64_t)(x) >> 40) & 0xff00ULL) | \
345
                      ((uint64_t)(x)  >> 56))
346
#endif
347
348
349
#ifdef _MSC_VER
350
#  define ROTATE(x, b)  _rotl64(x, b)
351
#else
352
12.6G
#  define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
353
#endif
354
355
#define HALF_ROUND(a,b,c,d,s,t)     \
356
4.20G
    a += b; c += d;                 \
357
4.20G
    b = ROTATE(b, s) ^ a;           \
358
4.20G
    d = ROTATE(d, t) ^ c;           \
359
4.20G
    a = ROTATE(a, 32);
360
361
#define SINGLE_ROUND(v0,v1,v2,v3)   \
362
2.10G
    HALF_ROUND(v0,v1,v2,v3,13,16);  \
363
2.10G
    HALF_ROUND(v2,v1,v0,v3,17,21);
364
365
#define DOUBLE_ROUND(v0,v1,v2,v3)   \
366
    SINGLE_ROUND(v0,v1,v2,v3);      \
367
    SINGLE_ROUND(v0,v1,v2,v3);
368
369
370
static uint64_t
371
108M
siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
372
108M
    uint64_t b = (uint64_t)src_sz << 56;
373
108M
    const uint8_t *in = (const uint8_t*)src;
374
375
108M
    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
376
108M
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
377
108M
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
378
108M
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
379
380
108M
    uint64_t t;
381
108M
    uint8_t *pt;
382
383
1.77G
    while (src_sz >= 8) {
384
1.66G
        uint64_t mi;
385
1.66G
        memcpy(&mi, in, sizeof(mi));
386
1.66G
        mi = _le64toh(mi);
387
1.66G
        in += sizeof(mi);
388
1.66G
        src_sz -= sizeof(mi);
389
1.66G
        v3 ^= mi;
390
1.66G
        SINGLE_ROUND(v0,v1,v2,v3);
391
1.66G
        v0 ^= mi;
392
1.66G
    }
393
394
108M
    t = 0;
395
108M
    pt = (uint8_t *)&t;
396
108M
    switch (src_sz) {
397
2.44M
        case 7: pt[6] = in[6]; _Py_FALLTHROUGH;
398
10.5M
        case 6: pt[5] = in[5]; _Py_FALLTHROUGH;
399
15.0M
        case 5: pt[4] = in[4]; _Py_FALLTHROUGH;
400
26.8M
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
401
3.68M
        case 3: pt[2] = in[2]; _Py_FALLTHROUGH;
402
49.9M
        case 2: pt[1] = in[1]; _Py_FALLTHROUGH;
403
55.6M
        case 1: pt[0] = in[0]; break;
404
108M
    }
405
108M
    b |= _le64toh(t);
406
407
108M
    v3 ^= b;
408
108M
    SINGLE_ROUND(v0,v1,v2,v3);
409
108M
    v0 ^= b;
410
108M
    v2 ^= 0xff;
411
108M
    SINGLE_ROUND(v0,v1,v2,v3);
412
108M
    SINGLE_ROUND(v0,v1,v2,v3);
413
108M
    SINGLE_ROUND(v0,v1,v2,v3);
414
415
    /* modified */
416
108M
    t = (v0 ^ v1) ^ (v2 ^ v3);
417
108M
    return t;
418
108M
}
419
420
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
421
static uint64_t
422
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
423
    uint64_t b = (uint64_t)src_sz << 56;
424
    const uint8_t *in = (const uint8_t*)src;
425
426
    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
427
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
428
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
429
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;
430
431
    uint64_t t;
432
    uint8_t *pt;
433
434
    while (src_sz >= 8) {
435
        uint64_t mi;
436
        memcpy(&mi, in, sizeof(mi));
437
        mi = _le64toh(mi);
438
        in += sizeof(mi);
439
        src_sz -= sizeof(mi);
440
        v3 ^= mi;
441
        DOUBLE_ROUND(v0,v1,v2,v3);
442
        v0 ^= mi;
443
    }
444
445
    t = 0;
446
    pt = (uint8_t *)&t;
447
    switch (src_sz) {
448
        case 7: pt[6] = in[6]; _Py_FALLTHROUGH;
449
        case 6: pt[5] = in[5]; _Py_FALLTHROUGH;
450
        case 5: pt[4] = in[4]; _Py_FALLTHROUGH;
451
        case 4: memcpy(pt, in, sizeof(uint32_t)); break;
452
        case 3: pt[2] = in[2]; _Py_FALLTHROUGH;
453
        case 2: pt[1] = in[1]; _Py_FALLTHROUGH;
454
        case 1: pt[0] = in[0]; break;
455
    }
456
    b |= _le64toh(t);
457
458
    v3 ^= b;
459
    DOUBLE_ROUND(v0,v1,v2,v3);
460
    v0 ^= b;
461
    v2 ^= 0xff;
462
    DOUBLE_ROUND(v0,v1,v2,v3);
463
    DOUBLE_ROUND(v0,v1,v2,v3);
464
465
    /* modified */
466
    t = (v0 ^ v1) ^ (v2 ^ v3);
467
    return t;
468
}
469
#endif
470
471
uint64_t
472
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
473
0
{
474
0
    return siphash13(key, 0, src, src_sz);
475
0
}
476
477
478
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
479
static Py_hash_t
480
108M
pysiphash(const void *src, Py_ssize_t src_sz) {
481
108M
    return (Py_hash_t)siphash13(
482
108M
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
483
108M
        src, src_sz);
484
108M
}
485
486
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
487
#endif
488
489
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
490
static Py_hash_t
491
pysiphash(const void *src, Py_ssize_t src_sz) {
492
    return (Py_hash_t)siphash24(
493
        _le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
494
        src, src_sz);
495
}
496
497
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
498
#endif