Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/sparse/dtype.py: 34%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

143 statements  

1"""Sparse Dtype""" 

2from __future__ import annotations 

3 

4import re 

5from typing import ( 

6 TYPE_CHECKING, 

7 Any, 

8) 

9import warnings 

10 

11import numpy as np 

12 

13from pandas._typing import ( 

14 Dtype, 

15 DtypeObj, 

16 type_t, 

17) 

18from pandas.errors import PerformanceWarning 

19from pandas.util._exceptions import find_stack_level 

20 

21from pandas.core.dtypes.astype import astype_array 

22from pandas.core.dtypes.base import ( 

23 ExtensionDtype, 

24 register_extension_dtype, 

25) 

26from pandas.core.dtypes.common import ( 

27 is_bool_dtype, 

28 is_object_dtype, 

29 is_scalar, 

30 is_string_dtype, 

31 pandas_dtype, 

32) 

33from pandas.core.dtypes.missing import ( 

34 isna, 

35 na_value_for_dtype, 

36) 

37 

38if TYPE_CHECKING: 

39 from pandas.core.arrays.sparse.array import SparseArray 

40 

41 

@register_extension_dtype
class SparseDtype(ExtensionDtype):
    """
    Dtype for data stored in :class:`SparseArray`.

    This dtype implements the pandas ExtensionDtype interface.

    Parameters
    ----------
    dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64
        The dtype of the underlying array storing the non-fill value values.
    fill_value : scalar, optional
        The scalar value not stored in the SparseArray. By default, this
        depends on `dtype`.

        =========== ==========
        dtype       na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The default value may be overridden by specifying a `fill_value`.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # We include `_is_na_fill_value` in the metadata to avoid hash collisions
    # between SparseDtype(float, 0.0) and SparseDtype(float, nan).
    # Without is_na_fill_value in the comparison, those would be equal since
    # hash(nan) is (sometimes?) 0.
    _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

    def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:
        # Unwrap an existing SparseDtype: reuse its subtype and, unless the
        # caller supplied one explicitly, its fill value as well.
        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            # String-like subtypes are stored as object dtype.
            dtype = np.dtype("object")

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        self._dtype = dtype
        self._fill_value = fill_value
        self._check_fill_value()

    def __hash__(self) -> int:
        # Python3 doesn't inherit __hash__ when a base class overrides
        # __eq__, so we explicitly do it here.
        return super().__hash__()

    def __eq__(self, other: Any) -> bool:
        """
        Compare with another SparseDtype, or with a string naming one.

        Two SparseDtypes are equal when their subtypes are equal and their
        fill values match. NA fill values match only when both sides are NA
        and one fill value is an instance of the other's type.
        """
        # We have to override __eq__ to handle NA values in _metadata.
        # The base class does simple == checks, which fail for NA.
        if isinstance(other, str):
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False

        if isinstance(other, type(self)):
            subtype = self.subtype == other.subtype
            if self._is_na_fill_value:
                # this case is complicated by two things:
                # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)
                # SparseDtype(float, np.nan)     != SparseDtype(float, pd.NaT)
                # i.e. we want to treat any floating-point NaN as equal, but
                # not a floating-point NaN and a datetime NaT.
                #
                # The parentheses matter: without them ``and`` binds tighter
                # than ``or`` and the second isinstance check alone could make
                # an NA fill value compare equal to a non-NA one of the same
                # type (e.g. nan vs. 0.0).
                fill_value = other._is_na_fill_value and (
                    isinstance(self.fill_value, type(other.fill_value))
                    or isinstance(other.fill_value, type(self.fill_value))
                )
            else:
                with warnings.catch_warnings():
                    # Ignore spurious numpy warning
                    warnings.filterwarnings(
                        "ignore",
                        "elementwise comparison failed",
                        category=DeprecationWarning,
                    )

                    fill_value = self.fill_value == other.fill_value

            return subtype and fill_value
        return False

    @property
    def fill_value(self):
        """
        The fill value of the array.

        Converting the SparseArray to a dense ndarray will fill the
        array with this value.

        .. warning::

           It's possible to end up with a SparseArray that has ``fill_value``
           values in ``sp_values``. This can occur, for example, when setting
           ``SparseArray.fill_value`` directly.
        """
        return self._fill_value

    def _check_fill_value(self):
        """
        Validate ``self._fill_value``.

        Raises
        ------
        ValueError
            If the fill value is not a scalar.
        """
        if not is_scalar(self._fill_value):
            raise ValueError(
                f"fill_value must be a scalar. Got {self._fill_value} instead"
            )
        # TODO: Right now we can use Sparse boolean array
        #       with any fill_value. Here was an attempt
        #       to allow only 3 value: True, False or nan
        #       but plenty test has failed.
        # see pull 44955
        # if self._is_boolean and not (
        #    is_bool(self._fill_value) or isna(self._fill_value)
        # ):
        #    raise ValueError(
        #        "fill_value must be True, False or nan "
        #        f"for boolean type. Got {self._fill_value} instead"
        #    )

    @property
    def _is_na_fill_value(self) -> bool:
        """Whether the fill value is considered missing by ``isna``."""
        return isna(self.fill_value)

    @property
    def _is_numeric(self) -> bool:
        """Whether the subtype is anything other than object dtype."""
        return not is_object_dtype(self.subtype)

    @property
    def _is_boolean(self) -> bool:
        """Whether the subtype is a boolean dtype."""
        return is_bool_dtype(self.subtype)

    @property
    def kind(self) -> str:
        """
        The ``kind`` attribute of the subtype.
        """
        return self.subtype.kind

    @property
    def type(self):
        """The ``type`` attribute of the subtype."""
        return self.subtype.type

    @property
    def subtype(self):
        """The dtype of the stored (non-fill) values."""
        return self._dtype

    @property
    def name(self) -> str:
        """String form ``Sparse[<subtype name>, <repr of fill value>]``."""
        return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"

    def __repr__(self) -> str:
        return self.name

    @classmethod
    def construct_array_type(cls) -> type_t[SparseArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        # Imported here rather than at module level.
        from pandas.core.arrays.sparse.array import SparseArray

        return SparseArray

    @classmethod
    def construct_from_string(cls, string: str) -> SparseDtype:
        """
        Construct a SparseDtype from a string form.

        Parameters
        ----------
        string : str
            Can take the following forms.

            string             dtype
            ================== ============================
            'Sparse'           SparseDtype[np.float64, nan]
            'Sparse[int]'      SparseDtype[np.int64, 0]
            'Sparse[int64, 0]' SparseDtype[np.int64, 0]
            ================== ============================

            It is not possible to specify non-default fill values
            with a string. An argument like ``'Sparse[int, 1]'``
            will raise a ``TypeError`` because the default fill value
            for integers is 0.

            When a fill value is present, the whole string must equal
            ``str()`` of the resulting dtype, so the subtype has to be
            spelled with its canonical name.

        Returns
        -------
        SparseDtype

        Raises
        ------
        TypeError
            If `string` is not a str, does not start with ``"Sparse"``,
            cannot be parsed, or names a non-default fill value.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        msg = f"Cannot construct a 'SparseDtype' from '{string}'"
        if string.startswith("Sparse"):
            try:
                sub_type, has_fill_value = cls._parse_subtype(string)
            except ValueError as err:
                raise TypeError(msg) from err
            else:
                result = SparseDtype(sub_type)
                msg = (
                    f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "
                    "looks like the fill_value in the string is not "
                    "the default for the dtype. Non-default fill_values "
                    "are not supported. Use the 'SparseDtype()' "
                    "constructor instead."
                )
                # The fill value is never parsed; instead the default-filled
                # dtype is rendered back to text and compared with the input.
                if has_fill_value and str(result) != string:
                    raise TypeError(msg)
                return result
        else:
            raise TypeError(msg)

    @staticmethod
    def _parse_subtype(dtype: str) -> tuple[str, bool]:
        """
        Parse a string to get the subtype

        Parameters
        ----------
        dtype : str
            A string like

            * Sparse[subtype]
            * Sparse[subtype, fill_value]

        Returns
        -------
        subtype : str
            The subtype portion; ``"float64"`` for the bare string
            ``"Sparse"``.
        has_fill_value : bool
            Whether a non-empty fill value portion was present.

        Raises
        ------
        ValueError
            When the subtype cannot be extracted.
        """
        xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")
        m = xpr.match(dtype)
        has_fill_value = False
        if m:
            subtype = m.groupdict()["subtype"]
            has_fill_value = bool(m.groupdict()["fill_value"])
        elif dtype == "Sparse":
            subtype = "float64"
        else:
            raise ValueError(f"Cannot parse {dtype}")
        return subtype, has_fill_value

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check whether `dtype` (or its ``.dtype`` attribute) is a SparseDtype.

        Strings starting with ``"Sparse"`` are parsed with
        :meth:`_parse_subtype`; a ``ValueError`` from that parse is not
        caught here.
        """
        dtype = getattr(dtype, "dtype", dtype)
        if isinstance(dtype, str) and dtype.startswith("Sparse"):
            sub_type, _ = cls._parse_subtype(dtype)
            dtype = np.dtype(sub_type)
        elif isinstance(dtype, cls):
            return True
        # NOTE(review): any np.dtype reaching this line yields True, whether
        # it came from parsing a "Sparse[...]" string above or from a
        # non-string input's ``.dtype`` attribute -- confirm this is intended.
        return isinstance(dtype, np.dtype) or dtype == "Sparse"

    def update_dtype(self, dtype) -> SparseDtype:
        """
        Convert the SparseDtype to a new dtype.

        This takes care of converting the ``fill_value``.

        Parameters
        ----------
        dtype : Union[str, numpy.dtype, SparseDtype]
            The new dtype to use.

            * For a SparseDtype, it is simply returned
            * For a NumPy dtype (or str), the current fill value
              is converted to the new dtype, and a SparseDtype
              with `dtype` and the new fill value is returned.

        Returns
        -------
        SparseDtype
            A new SparseDtype with the correct `dtype` and fill value
            for that `dtype`.

        Raises
        ------
        ValueError
            When the current fill value cannot be converted to the
            new `dtype` (e.g. trying to convert ``np.nan`` to an
            integer dtype).
        TypeError
            When `dtype` resolves to something that is neither a
            SparseDtype nor a NumPy dtype.

        Examples
        --------
        >>> SparseDtype(int, 0).update_dtype(float)
        Sparse[float64, 0.0]

        >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))
        Sparse[float64, nan]
        """
        cls = type(self)
        dtype = pandas_dtype(dtype)

        if not isinstance(dtype, cls):
            if not isinstance(dtype, np.dtype):
                raise TypeError("sparse arrays of extension dtypes not supported")

            # Cast the fill value by round-tripping it through a 1-element
            # array of the target dtype.
            fv_asarray = np.atleast_1d(np.array(self.fill_value))
            fvarr = astype_array(fv_asarray, dtype)
            # NB: not fvarr.item(), as that casts dt64->int
            fill_value = fvarr[0]
            dtype = cls(dtype, fill_value=fill_value)

        return dtype

    @property
    def _subtype_with_str(self):
        """
        Whether the SparseDtype's subtype should be considered ``str``.

        Typically, pandas will store string data in an object-dtype array.
        When converting values to a dtype, e.g. in ``.astype``, we need to
        be more specific, we need the actual underlying type.

        Returns
        -------
        type or numpy.dtype
            ``str`` (the type of the fill value) when the fill value is a
            string, otherwise the subtype.

        Examples
        --------
        >>> SparseDtype(int, 1)._subtype_with_str
        dtype('int64')

        >>> SparseDtype(object, 1)._subtype_with_str
        dtype('O')

        >>> dtype = SparseDtype(str, '')
        >>> dtype.subtype
        dtype('O')

        >>> dtype._subtype_with_str
        <class 'str'>
        """
        if isinstance(self.fill_value, str):
            return type(self.fill_value)
        return self.subtype

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Find a common SparseDtype for `dtypes`, or ``None`` if unsupported.

        Returns ``None`` when any entry is an ExtensionDtype other than
        SparseDtype. Otherwise the result uses the common NumPy type of the
        subtypes and the fill value of the first SparseDtype in `dtypes`.
        """
        # TODO for now only handle SparseDtypes and numpy dtypes => extend
        # with other compatible extension dtypes
        from pandas.core.dtypes.cast import np_find_common_type

        if any(
            isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)
            for x in dtypes
        ):
            return None

        fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]
        # NOTE(review): assumes at least one SparseDtype is present in
        # `dtypes`; an empty list would raise IndexError here.
        fill_value = fill_values[0]

        # np.nan isn't a singleton, so we may end up with multiple
        # NaNs here, so we ignore the all NA case too.
        if not (len(set(fill_values)) == 1 or isna(fill_values).all()):
            warnings.warn(
                "Concatenating sparse arrays with multiple fill "
                f"values: '{fill_values}'. Picking the first and "
                "converting the rest.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )

        np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)
        return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)

420 "converting the rest.", 

421 PerformanceWarning, 

422 stacklevel=find_stack_level(), 

423 ) 

424 

425 np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes) 

426 return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)