"""
Hashing functions for pandas / numpy objects.
"""
4from __future__ import annotations
5
6import itertools
7from typing import TYPE_CHECKING
8
9import numpy as np
10
11from pandas._libs.hashing import hash_object_array
12
13from pandas.core.dtypes.common import is_list_like
14from pandas.core.dtypes.dtypes import CategoricalDtype
15from pandas.core.dtypes.generic import (
16 ABCDataFrame,
17 ABCExtensionArray,
18 ABCIndex,
19 ABCMultiIndex,
20 ABCSeries,
21)
22
23if TYPE_CHECKING:
24 from collections.abc import (
25 Hashable,
26 Iterable,
27 Iterator,
28 )
29
30 from pandas._typing import (
31 ArrayLike,
32 npt,
33 )
34
35 from pandas import (
36 DataFrame,
37 Index,
38 MultiIndex,
39 Series,
40 )
41
42
# Default 16-byte key fed to the string-hashing routine; callers may
# override it, and ``None`` is normalized back to this value.
_default_hash_key = "0123456789123456"
45
46
def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Merge an iterator of uint64 hash arrays into one combined hash array.

    The mixing scheme mirrors CPython's tuple hash (tupleobject.c) so a
    "row" of hashed columns combines the way a tuple of the values would.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
        Pre-computed uint64 hash arrays, all of the same length.
    num_items : int
        Expected number of arrays yielded; validated after consumption.

    Returns
    -------
    np.ndarray[uint64]
        Element-wise combination of all inputs; empty if ``arrays`` is empty.
    """
    try:
        head = next(arrays)
    except StopIteration:
        # Nothing to combine.
        return np.array([], dtype=np.uint64)

    mult = np.uint64(1000003)
    combined = np.zeros_like(head) + np.uint64(0x345678)

    processed = 0
    for chunk in itertools.chain([head], arrays):
        combined ^= chunk
        combined *= mult
        # The multiplier evolves per position, exactly as CPython's does.
        mult += np.uint64(82520 + 2 * (num_items - processed))
        processed += 1
    assert processed == num_items, "Fed in wrong num_items"
    combined += np.uint64(97531)
    return combined
81
82
def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Examples
    --------
    >>> pd.util.hash_pandas_object(pd.Series([1, 2, 3]))
    0    14639053686158035780
    1     3869563279212530728
    2      393322362522515241
    dtype: uint64
    """
    from pandas import Series

    key = _default_hash_key if hash_key is None else hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, key), dtype="uint64", copy=False)

    if isinstance(obj, ABCIndex):
        values = hash_array(obj._values, encoding, key, categorize).astype(
            "uint64", copy=False
        )
        return Series(values, index=obj, dtype="uint64", copy=False)

    if isinstance(obj, ABCSeries):
        values = hash_array(obj._values, encoding, key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # One-shot generator so the index hash is computed lazily,
            # only when combine_hash_arrays consumes it.
            lazy_index_hash = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=key,
                    categorize=categorize,
                )._values
                for _ in (None,)
            )
            values = combine_hash_arrays(
                itertools.chain([values], lazy_index_hash), 2
            )
        return Series(values, index=obj.index, dtype="uint64", copy=False)

    if isinstance(obj, ABCDataFrame):
        column_hashes = (
            hash_array(col._values, encoding, key, categorize)
            for _, col in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            # Same lazy one-shot trick as the Series branch above.
            lazy_index_hash = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=key,
                    categorize=categorize,
                )._values
                for _ in (None,)
            )
            num_items += 1

            # keep `column_hashes` specifically a generator to keep mypy happy
            chained = itertools.chain(column_hashes, lazy_index_hash)
            column_hashes = (piece for piece in chained)
        combined = combine_hash_arrays(column_hashes, num_items)
        return Series(combined, index=obj.index, dtype="uint64", copy=False)

    raise TypeError(f"Unexpected type for hashing {type(obj)}")
181
182
183def hash_tuples(
184 vals: MultiIndex | Iterable[tuple[Hashable, ...]],
185 encoding: str = "utf8",
186 hash_key: str = _default_hash_key,
187) -> npt.NDArray[np.uint64]:
188 """
189 Hash an MultiIndex / listlike-of-tuples efficiently.
190
191 Parameters
192 ----------
193 vals : MultiIndex or listlike-of-tuples
194 encoding : str, default 'utf8'
195 hash_key : str, default _default_hash_key
196
197 Returns
198 -------
199 ndarray[np.uint64] of hashed values
200 """
201 if not is_list_like(vals):
202 raise TypeError("must be convertible to a list-of-tuples")
203
204 from pandas import (
205 Categorical,
206 MultiIndex,
207 )
208
209 if not isinstance(vals, ABCMultiIndex):
210 mi = MultiIndex.from_tuples(vals)
211 else:
212 mi = vals
213
214 # create a list-of-Categoricals
215 cat_vals = [
216 Categorical._simple_new(
217 mi.codes[level],
218 CategoricalDtype(categories=mi.levels[level], ordered=False),
219 )
220 for level in range(mi.nlevels)
221 ]
222
223 # hash the list-of-ndarrays
224 hashes = (
225 cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False)
226 for cat in cat_vals
227 )
228 h = combine_hash_arrays(hashes, len(cat_vals))
229
230 return h
231
232
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.

    Examples
    --------
    >>> pd.util.hash_array(np.array([1, 2, 3]))
    array([ 6238072747940578789, 15839785061582574730,  2185194620014831856],
          dtype=uint64)
    """
    # Anything without a dtype is not array-like enough to hash.
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # ExtensionArrays know how to hash themselves.
    if isinstance(vals, ABCExtensionArray):
        return vals._hash_pandas_object(
            encoding=encoding, hash_key=hash_key, categorize=categorize
        )

    if isinstance(vals, np.ndarray):
        return _hash_ndarray(vals, encoding, hash_key, categorize)

    # GH#42003
    raise TypeError(
        "hash_array requires np.ndarray or ExtensionArray, not "
        f"{type(vals).__name__}. Use hash_pandas_object instead."
    )
280
281
282def _hash_ndarray(
283 vals: np.ndarray,
284 encoding: str = "utf8",
285 hash_key: str = _default_hash_key,
286 categorize: bool = True,
287) -> npt.NDArray[np.uint64]:
288 """
289 See hash_array.__doc__.
290 """
291 dtype = vals.dtype
292
293 # _hash_ndarray only takes 64-bit values, so handle 128-bit by parts
294 if np.issubdtype(dtype, np.complex128):
295 hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize)
296 hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize)
297 return hash_real + 23 * hash_imag
298
299 # First, turn whatever array this is into unsigned 64-bit ints, if we can
300 # manage it.
301 if dtype == bool:
302 vals = vals.astype("u8")
303 elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
304 vals = vals.view("i8").astype("u8", copy=False)
305 elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
306 vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
307 else:
308 # With repeated values, its MUCH faster to categorize object dtypes,
309 # then hash and rename categories. We allow skipping the categorization
310 # when the values are known/likely to be unique.
311 if categorize:
312 from pandas import (
313 Categorical,
314 Index,
315 factorize,
316 )
317
318 codes, categories = factorize(vals, sort=False)
319 dtype = CategoricalDtype(categories=Index(categories), ordered=False)
320 cat = Categorical._simple_new(codes, dtype)
321 return cat._hash_pandas_object(
322 encoding=encoding, hash_key=hash_key, categorize=False
323 )
324
325 try:
326 vals = hash_object_array(vals, hash_key, encoding)
327 except TypeError:
328 # we have mixed types
329 vals = hash_object_array(
330 vals.astype(str).astype(object), hash_key, encoding
331 )
332
333 # Then, redistribute these 64-bit ints within the space of 64-bit ints
334 vals ^= vals >> 30
335 vals *= np.uint64(0xBF58476D1CE4E5B9)
336 vals ^= vals >> 27
337 vals *= np.uint64(0x94D049BB133111EB)
338 vals ^= vals >> 31
339 return vals