1"""
2data hash pandas / numpy objects
3"""
4from __future__ import annotations
5
6import itertools
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterable,
11 Iterator,
12 cast,
13)
14
15import numpy as np
16
17from pandas._libs import lib
18from pandas._libs.hashing import hash_object_array
19from pandas._typing import (
20 ArrayLike,
21 npt,
22)
23
24from pandas.core.dtypes.common import (
25 is_categorical_dtype,
26 is_list_like,
27)
28from pandas.core.dtypes.generic import (
29 ABCDataFrame,
30 ABCExtensionArray,
31 ABCIndex,
32 ABCMultiIndex,
33 ABCSeries,
34)
35
36if TYPE_CHECKING:
37 from pandas import (
38 Categorical,
39 DataFrame,
40 Index,
41 MultiIndex,
42 Series,
43 )
44
45
# 16 byte long hashing key
# Used as the default ``hash_key`` by the hashing functions in this module
# whenever the caller does not supply one (or passes None).
_default_hash_key = "0123456789123456"
48
49
def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Combine a stream of uint64 hash arrays into a single uint64 hash array.

    The mixing scheme mirrors CPython's tuple hash (tupleobject.c), applied
    elementwise across the arrays.

    Parameters
    ----------
    arrays : Iterator[np.ndarray]
        Iterator of equal-length uint64 hash arrays.
    num_items : int
        The number of arrays the iterator is expected to yield (asserted).

    Returns
    -------
    np.ndarray[uint64]
    """
    try:
        head = next(arrays)
    except StopIteration:
        # No arrays at all: the canonical empty result.
        return np.array([], dtype=np.uint64)

    # Seed every position with the same start value, then fold each incoming
    # array in with xor/multiply rounds. uint64 wrap-around is intentional.
    multiplier = np.uint64(1000003)
    out = np.zeros_like(head) + np.uint64(0x345678)
    count = 0
    for count, chunk in enumerate(itertools.chain([head], arrays), start=1):
        out ^= chunk
        out *= multiplier
        # The per-round multiplier increment depends on how many items remain.
        remaining = num_items - count + 1
        multiplier += np.uint64(82520 + remaining + remaining)
    assert count == num_items, "Fed in wrong num_items"
    out += np.uint64(97531)
    return out
84
85
def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode. ``None`` falls back to the
        module default.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series, or DataFrame.
    """
    from pandas import Series

    if hash_key is None:
        # Allow callers to pass None to request the module default explicitly.
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        # A MultiIndex is hashed tuple-wise via hash_tuples; the resulting
        # Series gets a default (range) index, not the MultiIndex itself.
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndex):
        # Hash the index values themselves; the `index=` flag is not
        # consulted for a bare Index.
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        ser = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        # Hash the Series values first...
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            # ...then optionally fold in a hash of the index. The recursive
            # call (with index=False) is wrapped in a one-element generator
            # so it is only evaluated when combine_hash_arrays consumes it.
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            # Exactly 2 items: the value hashes and the index hashes.
            arrays = itertools.chain([h], index_iter)
            h = combine_hash_arrays(arrays, 2)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        # One hash array per column, produced lazily in column order.
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            # As in the Series case: defer hashing the index until it is
            # actually consumed by combine_hash_arrays.
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)
    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")

    return ser
176
177
def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    # Normalize the input to a MultiIndex so we can hash level-by-level.
    mi = vals if isinstance(vals, ABCMultiIndex) else MultiIndex.from_tuples(vals)

    # One Categorical per level, built directly from the existing
    # codes/levels (fastpath skips re-validation).
    level_cats = [
        Categorical(mi.codes[lev], mi.levels[lev], ordered=False, fastpath=True)
        for lev in range(mi.nlevels)
    ]

    # Hash each level's Categorical lazily, then combine across levels.
    level_hashes = (
        _hash_categorical(c, encoding=encoding, hash_key=hash_key)
        for c in level_cats
    )
    return combine_hash_arrays(level_hashes, len(level_cats))
222
223
def _hash_categorical(
    cat: Categorical, encoding: str, hash_key: str
) -> npt.NDArray[np.uint64]:
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes.

    Parameters
    ----------
    cat : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray[np.uint64] of hashed values, same size as len(c)
    """
    # Hash each distinct category once (ExtensionArray-backed categories are
    # converted to plain ndarrays first); categorize=False since the
    # categories are already unique.
    category_values = np.asarray(cat.categories._values)
    category_hashes = hash_array(category_values, encoding, hash_key, categorize=False)

    # uint64 has no native missing-value representation and we don't want
    # take_nd coercing to float, so construct the result directly and stamp
    # max(np.uint64) over the missing positions afterwards.
    #
    # TODO: GH 15362
    na_mask = cat.isna()
    if not len(category_hashes):
        # All-NA / empty categories: nothing to take from.
        result = np.zeros(len(na_mask), dtype="uint64")
    else:
        # NB: codes of -1 (missing) take the last element here; those slots
        # are overwritten by the mask below.
        result = category_hashes.take(cat.codes)

    if na_mask.any():
        result[na_mask] = lib.u8max

    return result
262
263
def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash_key for string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as the vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")

    # Categoricals hash their categories once and remap via the codes. This
    # check must come before any numpy dtype interrogation: asking numpy
    # whether a categorical dtype is a subdtype of complex would choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(cast("Categorical", vals), encoding, hash_key)

    if isinstance(vals, ABCExtensionArray):
        # Reduce any other ExtensionArray to its factorization values
        # (a plain ndarray).
        vals, _ = vals._values_for_factorize()
    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
311
312
def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.

    Core ndarray hashing: reinterpret the values as uint64 (hashing
    object/string data through hash_object_array or categorization first),
    then scramble with a splitmix64-style finalizer.
    """
    dtype = vals.dtype

    # We work with 64-bit values throughout, so split the 128-bit complex
    # case into two 64-bit halves up front.
    # FIX: recurse with encoding/hash_key/categorize forwarded — previously
    # the recursive calls dropped them, silently ignoring non-default
    # arguments for complex input. Recursing into _hash_ndarray directly
    # also skips hash_array's redundant type re-validation.
    if np.issubdtype(dtype, np.complex128):
        hash_real = _hash_ndarray(np.real(vals), encoding, hash_key, categorize)
        hash_imag = _hash_ndarray(np.imag(vals), encoding, hash_key, categorize)
        return hash_real + 23 * hash_imag

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        # Datetimes/timedeltas hash by their underlying integer representation.
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        # Reinterpret the raw bytes as unsigned ints of the same width.
        vals = vals.view(f"u{dtype.itemsize}").astype("u8")
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import (
                Categorical,
                Index,
                factorize,
            )

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types: fall back to hashing the string
            # representation of each element
            vals = hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    # (splitmix64 finalizer constants/shifts).
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals