Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/util/hashing.py: 14%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

106 statements  

1""" 

2data hash pandas / numpy objects 

3""" 

4from __future__ import annotations 

5 

6import itertools 

7from typing import TYPE_CHECKING 

8 

9import numpy as np 

10 

11from pandas._libs.hashing import hash_object_array 

12 

13from pandas.core.dtypes.common import is_list_like 

14from pandas.core.dtypes.dtypes import CategoricalDtype 

15from pandas.core.dtypes.generic import ( 

16 ABCDataFrame, 

17 ABCExtensionArray, 

18 ABCIndex, 

19 ABCMultiIndex, 

20 ABCSeries, 

21) 

22 

23if TYPE_CHECKING: 

24 from collections.abc import ( 

25 Hashable, 

26 Iterable, 

27 Iterator, 

28 ) 

29 

30 from pandas._typing import ( 

31 ArrayLike, 

32 npt, 

33 ) 

34 

35 from pandas import ( 

36 DataFrame, 

37 Index, 

38 MultiIndex, 

39 Series, 

40 ) 

41 

42 

43# 16 byte long hashing key 

44_default_hash_key = "0123456789123456" 

45 

46 

def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Combine several uint64 hash arrays into a single one, elementwise.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
    num_items : int

    Returns
    -------
    np.ndarray[uint64]

    Should be the same as CPython's tupleobject.c
    """
    # Pull the first array eagerly so the empty-iterator case can short-circuit
    # and so we know the shape/dtype to seed the accumulator with.
    try:
        head = next(arrays)
    except StopIteration:
        return np.array([], dtype=np.uint64)

    multiplier = np.uint64(1000003)
    # Seed mirrors CPython's tuple-hash initial value (0x345678).
    combined = np.zeros_like(head) + np.uint64(0x345678)
    seen = 0
    for position, chunk in enumerate(itertools.chain([head], arrays)):
        remaining = num_items - position
        combined ^= chunk
        combined *= multiplier
        # Multiplier evolves per element, as in tupleobject.c.
        multiplier += np.uint64(82520 + remaining + remaining)
        seen = position + 1
    assert seen == num_items, "Fed in wrong num_items"
    combined += np.uint64(97531)
    return combined

81 

82 

def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Examples
    --------
    >>> pd.util.hash_pandas_object(pd.Series([1, 2, 3]))
    0    14639053686158035780
    1     3869563279212530728
    2      393322362522515241
    dtype: uint64
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    # Dispatch on the concrete pandas container, returning early per case.
    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    if isinstance(obj, ABCIndex):
        hashed = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        return Series(hashed, index=obj, dtype="uint64", copy=False)

    if isinstance(obj, ABCSeries):
        hashed = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # Hash the index separately (without hashing *its* index) and fold
            # it into the value hashes.
            index_hashes = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            hashed = combine_hash_arrays(itertools.chain([hashed], index_hashes), 2)
        return Series(hashed, index=obj.index, dtype="uint64", copy=False)

    if isinstance(obj, ABCDataFrame):
        column_hashes = (
            hash_array(col._values, encoding, hash_key, categorize)
            for _, col in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            index_hashes = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `column_hashes` specifically a generator to keep mypy happy
            chained = itertools.chain(column_hashes, index_hashes)
            column_hashes = (x for x in chained)
        combined = combine_hash_arrays(column_hashes, num_items)
        return Series(combined, index=obj.index, dtype="uint64", copy=False)

    raise TypeError(f"Unexpected type for hashing {type(obj)}")

181 

182 

183def hash_tuples( 

184 vals: MultiIndex | Iterable[tuple[Hashable, ...]], 

185 encoding: str = "utf8", 

186 hash_key: str = _default_hash_key, 

187) -> npt.NDArray[np.uint64]: 

188 """ 

189 Hash an MultiIndex / listlike-of-tuples efficiently. 

190 

191 Parameters 

192 ---------- 

193 vals : MultiIndex or listlike-of-tuples 

194 encoding : str, default 'utf8' 

195 hash_key : str, default _default_hash_key 

196 

197 Returns 

198 ------- 

199 ndarray[np.uint64] of hashed values 

200 """ 

201 if not is_list_like(vals): 

202 raise TypeError("must be convertible to a list-of-tuples") 

203 

204 from pandas import ( 

205 Categorical, 

206 MultiIndex, 

207 ) 

208 

209 if not isinstance(vals, ABCMultiIndex): 

210 mi = MultiIndex.from_tuples(vals) 

211 else: 

212 mi = vals 

213 

214 # create a list-of-Categoricals 

215 cat_vals = [ 

216 Categorical._simple_new( 

217 mi.codes[level], 

218 CategoricalDtype(categories=mi.levels[level], ordered=False), 

219 ) 

220 for level in range(mi.nlevels) 

221 ] 

222 

223 # hash the list-of-ndarrays 

224 hashes = ( 

225 cat._hash_pandas_object(encoding=encoding, hash_key=hash_key, categorize=False) 

226 for cat in cat_vals 

227 ) 

228 h = combine_hash_arrays(hashes, len(cat_vals)) 

229 

230 return h 

231 

232 

233def hash_array( 

234 vals: ArrayLike, 

235 encoding: str = "utf8", 

236 hash_key: str = _default_hash_key, 

237 categorize: bool = True, 

238) -> npt.NDArray[np.uint64]: 

239 """ 

240 Given a 1d array, return an array of deterministic integers. 

241 

242 Parameters 

243 ---------- 

244 vals : ndarray or ExtensionArray 

245 encoding : str, default 'utf8' 

246 Encoding for data & key when strings. 

247 hash_key : str, default _default_hash_key 

248 Hash_key for string key to encode. 

249 categorize : bool, default True 

250 Whether to first categorize object arrays before hashing. This is more 

251 efficient when the array contains duplicate values. 

252 

253 Returns 

254 ------- 

255 ndarray[np.uint64, ndim=1] 

256 Hashed values, same length as the vals. 

257 

258 Examples 

259 -------- 

260 >>> pd.util.hash_array(np.array([1, 2, 3])) 

261 array([ 6238072747940578789, 15839785061582574730, 2185194620014831856], 

262 dtype=uint64) 

263 """ 

264 if not hasattr(vals, "dtype"): 

265 raise TypeError("must pass a ndarray-like") 

266 

267 if isinstance(vals, ABCExtensionArray): 

268 return vals._hash_pandas_object( 

269 encoding=encoding, hash_key=hash_key, categorize=categorize 

270 ) 

271 

272 if not isinstance(vals, np.ndarray): 

273 # GH#42003 

274 raise TypeError( 

275 "hash_array requires np.ndarray or ExtensionArray, not " 

276 f"{type(vals).__name__}. Use hash_pandas_object instead." 

277 ) 

278 

279 return _hash_ndarray(vals, encoding, hash_key, categorize) 

280 

281 

282def _hash_ndarray( 

283 vals: np.ndarray, 

284 encoding: str = "utf8", 

285 hash_key: str = _default_hash_key, 

286 categorize: bool = True, 

287) -> npt.NDArray[np.uint64]: 

288 """ 

289 See hash_array.__doc__. 

290 """ 

291 dtype = vals.dtype 

292 

293 # _hash_ndarray only takes 64-bit values, so handle 128-bit by parts 

294 if np.issubdtype(dtype, np.complex128): 

295 hash_real = _hash_ndarray(vals.real, encoding, hash_key, categorize) 

296 hash_imag = _hash_ndarray(vals.imag, encoding, hash_key, categorize) 

297 return hash_real + 23 * hash_imag 

298 

299 # First, turn whatever array this is into unsigned 64-bit ints, if we can 

300 # manage it. 

301 if dtype == bool: 

302 vals = vals.astype("u8") 

303 elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): 

304 vals = vals.view("i8").astype("u8", copy=False) 

305 elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: 

306 vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8") 

307 else: 

308 # With repeated values, its MUCH faster to categorize object dtypes, 

309 # then hash and rename categories. We allow skipping the categorization 

310 # when the values are known/likely to be unique. 

311 if categorize: 

312 from pandas import ( 

313 Categorical, 

314 Index, 

315 factorize, 

316 ) 

317 

318 codes, categories = factorize(vals, sort=False) 

319 dtype = CategoricalDtype(categories=Index(categories), ordered=False) 

320 cat = Categorical._simple_new(codes, dtype) 

321 return cat._hash_pandas_object( 

322 encoding=encoding, hash_key=hash_key, categorize=False 

323 ) 

324 

325 try: 

326 vals = hash_object_array(vals, hash_key, encoding) 

327 except TypeError: 

328 # we have mixed types 

329 vals = hash_object_array( 

330 vals.astype(str).astype(object), hash_key, encoding 

331 ) 

332 

333 # Then, redistribute these 64-bit ints within the space of 64-bit ints 

334 vals ^= vals >> 30 

335 vals *= np.uint64(0xBF58476D1CE4E5B9) 

336 vals ^= vals >> 27 

337 vals *= np.uint64(0x94D049BB133111EB) 

338 vals ^= vals >> 31 

339 return vals