"""
Hashing functions for pandas / numpy objects.
"""
4from __future__ import annotations
5
6import itertools
7from typing import TYPE_CHECKING
8
9import numpy as np
10
11from pandas._libs.hashing import hash_object_array
12
13from pandas.core.dtypes.common import is_list_like
14from pandas.core.dtypes.dtypes import CategoricalDtype
15from pandas.core.dtypes.generic import (
16 ABCDataFrame,
17 ABCExtensionArray,
18 ABCIndex,
19 ABCMultiIndex,
20 ABCSeries,
21)
22
23if TYPE_CHECKING:
24 from collections.abc import (
25 Hashable,
26 Iterable,
27 Iterator,
28 )
29
30 from pandas._typing import (
31 ArrayLike,
32 npt,
33 )
34
35 from pandas import (
36 DataFrame,
37 Index,
38 MultiIndex,
39 Series,
40 )
41
42
# Default 16-byte key fed to the string-hashing routine; callers may
# override it, and ``None`` is normalized back to this value.
_default_hash_key = "0123456789123456"
45
46
def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Merge an iterator of uint64 hash arrays into one combined hash array.

    The mixing scheme mirrors CPython's tuple hash (tupleobject.c) so a
    "row" of hashed columns combines the way a tuple of the values would.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
        Pre-computed uint64 hash arrays, all of the same length.
    num_items : int
        Expected number of arrays yielded; validated after consumption.

    Returns
    -------
    np.ndarray[uint64]
        Element-wise combination of all inputs; empty if ``arrays`` is empty.
    """
    try:
        head = next(arrays)
    except StopIteration:
        # Nothing to combine.
        return np.array([], dtype=np.uint64)

    mult = np.uint64(1000003)
    combined = np.zeros_like(head) + np.uint64(0x345678)

    processed = 0
    for chunk in itertools.chain([head], arrays):
        combined ^= chunk
        combined *= mult
        # The multiplier evolves per position, exactly as CPython's does.
        mult += np.uint64(82520 + 2 * (num_items - processed))
        processed += 1
    assert processed == num_items, "Fed in wrong num_items"
    combined += np.uint64(97531)
    return combined
81
82
def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Examples
    --------
    >>> pd.util.hash_pandas_object(pd.Series([1, 2, 3]))
    0    14639053686158035780
    1     3869563279212530728
    2      393322362522515241
    dtype: uint64
    """
    from pandas import Series

    key = _default_hash_key if hash_key is None else hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, key), dtype="uint64", copy=False)

    if isinstance(obj, ABCIndex):
        values = hash_array(obj._values, encoding, key, categorize).astype(
            "uint64", copy=False
        )
        return Series(values, index=obj, dtype="uint64", copy=False)

    if isinstance(obj, ABCSeries):
        values = hash_array(obj._values, encoding, key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # One-shot generator so the index hash is computed lazily,
            # only when combine_hash_arrays consumes it.
            lazy_index_hash = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=key,
                    categorize=categorize,
                )._values
                for _ in (None,)
            )
            values = combine_hash_arrays(
                itertools.chain([values], lazy_index_hash), 2
            )
        return Series(values, index=obj.index, dtype="uint64", copy=False)

    if isinstance(obj, ABCDataFrame):
        column_hashes = (
            hash_array(col._values, encoding, key, categorize)
            for _, col in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            # Same lazy one-shot trick as the Series branch above.
            lazy_index_hash = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=key,
                    categorize=categorize,
                )._values
                for _ in (None,)
            )
            num_items += 1

            # keep `column_hashes` specifically a generator to keep mypy happy
            chained = itertools.chain(column_hashes, lazy_index_hash)
            column_hashes = (piece for piece in chained)
        combined = combine_hash_arrays(column_hashes, num_items)
        return Series(combined, index=obj.index, dtype="uint64", copy=False)

    raise TypeError(f"Unexpected type for hashing {type(obj)}")
181
182
183def hash_tuples(
184 vals: MultiIndex | Iterable[tuple[Hashable, ...]],
185 encoding: str = "utf8",
186 hash_key: str = _default_hash_key,
187) -> npt.NDArray[np.uint64]:
188 """
189 Hash an MultiIndex / listlike-of-tuples efficiently.
190
191 Parameters
192 ----------
193 vals : MultiIndex or listlike-of-tuples
194 encoding : str, default 'utf8'
195 hash_key : str, default _default_hash_key
196
197 Returns
198 -------
199 ndarray[np.uint64] of hashed values
200 """
201 if not is_list_like(vals):
202 raise TypeError("must be convertible to a list-of-tuples")
203
204 from pandas import (
205 Categorical,
206 MultiIndex,
207 )
208
209 if not isinstance(vals, ABCMultiIndex):
210 mi = MultiIndex.from_tuples(vals)
211 else:
212 mi = vals
213
214 # create a list-of-Categoricals
215 cat_vals = [
216 Categorical._simple_new(
217 mi.codes[level],
218 CategoricalDtype(categories=mi.levels[level], ordered=False),
219 )
220 for level in range(mi.nlevels)
221 ]
222
223 # hash the list-of-ndarrays
224 hashes = (
225 cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
226 for cat in cat_vals
227 )
228 h = combine_hash_arrays(hashes, len(cat_vals))
229
230 return h
231
232
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.

    Examples
    --------
    >>> pd.util.hash_array(np.array([1, 2, 3]))
    array([ 6238072747940578789, 15839785061582574730,  2185194620014831856],
          dtype=uint64)
    """
    # Anything without a dtype is not array-like enough to hash.
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # ExtensionArrays know how to hash themselves.
    if isinstance(vals, ABCExtensionArray):
        return vals._hash_pandas_object(
            encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    if isinstance(vals, np.ndarray):
        return _hash_ndarray(vals, encoding, hash_key, categorize)

    # GH#42003
    raise TypeError(
        "hash_array requires np.ndarray or ExtensionArray, not "
        f"{type(vals).__name__}. Use hash_pandas_object instead."
    )
280
281
282def _hash_ndarray(
283 vals: np.ndarray,
284 encoding: str = "utf8",
285 hash_key: str = _default_hash_key,
286 categorize: bool = True,
287) -> npt.NDArray[np.uint64]:
288 """
289 See hash_array.__doc__.
290 """
291 dtype = vals.dtype
292
293 # _hash_ndarray only takes 64-bit values, so handle 128-bit by parts
294 if np.issubdtype(dtype, np.complex128):
295 hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize)
296 hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize)
297 return hash_real + 23 * hash_imag
298
299 # First, turn whatever array this is into unsigned 64-bit ints, if we can
300 # manage it.
301 if dtype == bool:
302 vals = vals.astype("u8")
303 elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
304 vals = vals.view("i8").astype("u8", copy=False)
305 elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
306 vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
307 else:
308 # With repeated values, its MUCH faster to categorize object dtypes,
309 # then hash and rename categories. We allow skipping the categorization
310 # when the values are known/likely to be unique.
311 if categorize:
312 from pandas import (
313 Categorical,
314 Index,
315 factorize,
316 )
317
318 codes, categories = factorize(vals, sort=False)
319 dtype = CategoricalDtype(categories=Index(categories), ordered=False)
320 cat = Categorical._simple_new(codes, dtype)
321 return cat._hash_pandas_object(
322 encoding=encoding, hash_key=hash_key, categorize=False
323 )
324
325 try:
326 vals = hash_object_array(vals, hash_key, encoding)
327 except TypeError:
328 # we have mixed types
329 vals = hash_object_array(
330 vals.astype(str).astype(object), hash_key, encoding
331 )
332
333 # Then, redistribute these 64-bit ints within the space of 64-bit ints
334 vals ^= vals >> 30
335 vals *= np.uint64(0xBF58476D1CE4E5B9)
336 vals ^= vals >> 27
337 vals *= np.uint64(0x94D049BB133111EB)
338 vals ^= vals >> 31
339 return vals