1"""
2data hash pandas / numpy objects
3"""
4from __future__ import annotations
5
6import itertools
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterable,
11 Iterator,
12 cast,
13)
14
15import numpy as np
16
17from pandas._libs import lib
18from pandas._libs.hashing import hash_object_array
19from pandas._typing import (
20 ArrayLike,
21 npt,
22)
23
24from pandas.core.dtypes.common import (
25 is_categorical_dtype,
26 is_list_like,
27)
28from pandas.core.dtypes.generic import (
29 ABCDataFrame,
30 ABCExtensionArray,
31 ABCIndex,
32 ABCMultiIndex,
33 ABCSeries,
34)
35
36if TYPE_CHECKING:
37 from pandas import (
38 Categorical,
39 DataFrame,
40 Index,
41 MultiIndex,
42 Series,
43 )
44
45
# 16 byte long hashing key
# Used as the default ``hash_key`` by the hashing functions in this module
# whenever the caller does not supply one (or passes None).
_default_hash_key = "0123456789123456"
48
49
def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Combine a stream of uint64 hash arrays into a single uint64 hash array.

    The mixing scheme mirrors CPython's tuple hash (tupleobject.c), applied
    elementwise across the arrays.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
        Iterator of equal-length uint64 hash arrays.
    num_items : int
        The number of arrays the iterator is expected to yield (asserted).

    Returns
    -------
    np.ndarray[uint64]
    """
    try:
        head = next(arrays)
    except StopIteration:
        # No arrays at all: the canonical empty result.
        return np.array([], dtype=np.uint64)

    # Seed every position with the same start value, then fold each incoming
    # array in with xor/multiply rounds. uint64 wrap-around is intentional.
    multiplier = np.uint64(1000003)
    out = np.zeros_like(head) + np.uint64(0x345678)
    count = 0
    for count, chunk in enumerate(itertools.chain([head], arrays), start=1):
        out ^= chunk
        out *= multiplier
        # The per-round multiplier increment depends on how many items remain.
        remaining = num_items - count + 1
        multiplier += np.uint64(82520 + remaining + remaining)
    assert count == num_items, "Fed in wrong num_items"
    out += np.uint64(97531)
    return out
84
85
def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode. ``None`` falls back to the
        module default.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series, or DataFrame.
    """
    from pandas import Series

    if hash_key is None:
        # Allow callers to pass None to request the module default explicitly.
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        # A MultiIndex is hashed tuple-wise via hash_tuples; the resulting
        # Series gets a default (range) index, not the MultiIndex itself.
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndex):
        # Hash the index values themselves; the `index=` flag is not
        # consulted for a bare Index.
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        ser = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        # Hash the Series values first...
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # ...then optionally fold in a hash of the index. The recursive
            # call (with index=False) is wrapped in a one-element generator
            # so it is only evaluated when combine_hash_arrays consumes it.
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            # Exactly 2 items: the value hashes and the index hashes.
            arrays = itertools.chain([h], index_iter)
            h = combine_hash_arrays(arrays, 2)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        # One hash array per column, produced lazily in column order.
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            # As in the Series case: defer hashing the index until it is
            # actually consumed by combine_hash_arrays.
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")

    return ser
176
177
def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    # Normalize the input to a MultiIndex so we can hash level-by-level.
    mi = vals if isinstance(vals, ABCMultiIndex) else MultiIndex.from_tuples(vals)

    # One Categorical per level, built directly from the existing
    # codes/levels (fastpath skips re-validation).
    level_cats = [
        Categorical(mi.codes[lev], mi.levels[lev], ordered=False, fastpath=True)
        for lev in range(mi.nlevels)
    ]

    # Hash each level's Categorical lazily, then combine across levels.
    level_hashes = (
        _hash_categorical(c, encoding=encoding, hash_key=hash_key)
        for c in level_cats
    )
    return combine_hash_arrays(level_hashes, len(level_cats))
222
223
def _hash_categorical(
    cat: Categorical, encoding: str, hash_key: str
) -> npt.NDArray[np.uint64]:
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes.

    Parameters
    ----------
    cat : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray[np.uint64] of hashed values, same size as len(c)
    """
    # Hash each distinct category once (ExtensionArray-backed categories are
    # converted to plain ndarrays first); categorize=False since the
    # categories are already unique.
    category_values = np.asarray(cat.categories._values)
    category_hashes = hash_array(category_values, encoding, hash_key, categorize=False)

    # uint64 has no native missing-value representation and we don't want
    # take_nd coercing to float, so construct the result directly and stamp
    # max(np.uint64) over the missing positions afterwards.
    #
    # TODO: GH 15362
    na_mask = cat.isna()
    if not len(category_hashes):
        # All-NA / empty categories: nothing to take from.
        result = np.zeros(len(na_mask), dtype="uint64")
    else:
        # NB: codes of -1 (missing) take the last element here; those slots
        # are overwritten by the mask below.
        result = category_hashes.take(cat.codes)

    if na_mask.any():
        result[na_mask] = lib.u8max

    return result
262
263
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # Categoricals hash their categories once and remap via the codes. This
    # check must come before any numpy dtype interrogation: asking numpy
    # whether a categorical dtype is a subdtype of complex would choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(cast("Categorical", vals), encoding, hash_key)

    if isinstance(vals, ABCExtensionArray):
        # Reduce any other ExtensionArray to its factorization values
        # (a plain ndarray).
        vals, _ = vals._values_for_factorize()
    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
311
312
def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.

    Core ndarray hashing: reinterpret the values as uint64 (hashing
    object/string data through hash_object_array or categorization first),
    then scramble with a splitmix64-style finalizer.
    """
    dtype = vals.dtype

    # We work with 64-bit values throughout, so split the 128-bit complex
    # case into two 64-bit halves up front.
    # FIX: recurse with encoding/hash_key/categorize forwarded — previously
    # the recursive calls dropped them, silently ignoring non-default
    # arguments for complex input. Recursing into _hash_ndarray directly
    # also skips hash_array's redundant type re-validation.
    if np.issubdtype(dtype, np.complex128):
        hash_real = _hash_ndarray(np.real(vals), encoding, hash_key, categorize)
        hash_imag = _hash_ndarray(np.imag(vals), encoding, hash_key, categorize)
        return hash_real + 23 * hash_imag

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # Datetimes/timedeltas hash by their underlying integer representation.
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        # Reinterpret the raw bytes as unsigned ints of the same width.
        vals = vals.view(f"u{dtype.itemsize}").astype("u8")
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import (
                Categorical,
                Index,
                factorize,
            )

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types: fall back to hashing the string
            # representation of each element
            vals = hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    # (splitmix64 finalizer constants/shifts).
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals