Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/sparse/dtype.py: 34%

1"""Sparse Dtype"""

2from __future__ import annotations

4import re

5from typing import (

6 TYPE_CHECKING,

7 Any,

9import warnings

11import numpy as np

13from pandas._typing import (

14 Dtype,

15 DtypeObj,

16 type_t,

17)

18from pandas.errors import PerformanceWarning

19from pandas.util._exceptions import find_stack_level

21from pandas.core.dtypes.astype import astype_array

22from pandas.core.dtypes.base import (

23 ExtensionDtype,

24 register_extension_dtype,

25)

26from pandas.core.dtypes.common import (

27 is_bool_dtype,

28 is_object_dtype,

29 is_scalar,

30 is_string_dtype,

31 pandas_dtype,

32)

33from pandas.core.dtypes.missing import (

34 isna,

35 na_value_for_dtype,

36)

38if TYPE_CHECKING:

39 from pandas.core.arrays.sparse.array import SparseArray

42@register_extension_dtype

43class SparseDtype(ExtensionDtype):

44 """

45 Dtype for data stored in :class:`SparseArray`.

47 This dtype implements the pandas ExtensionDtype interface.

49 Parameters

50 ----------

51 dtype : str, ExtensionDtype, numpy.dtype, type, default numpy.float64

52 The dtype of the underlying array storing the non-fill value values.

53 fill_value : scalar, optional

54 The scalar value not stored in the SparseArray. By default, this

55 depends on `dtype`.

57 =========== ==========

58 dtype na_value

59 =========== ==========

60 float ``np.nan``

61 int ``0``

62 bool ``False``

63 datetime64 ``pd.NaT``

64 timedelta64 ``pd.NaT``

65 =========== ==========

67 The default value may be overridden by specifying a `fill_value`.

69 Attributes

70 ----------

71 None

73 Methods

74 -------

75 None

76 """

78 # We include `_is_na_fill_value` in the metadata to avoid hash collisions

79 # between SparseDtype(float, 0.0) and SparseDtype(float, nan).

80 # Without is_na_fill_value in the comparison, those would be equal since

81 # hash(nan) is (sometimes?) 0.

82 _metadata = ("_dtype", "_fill_value", "_is_na_fill_value")

84 def __init__(self, dtype: Dtype = np.float64, fill_value: Any = None) -> None:

85 if isinstance(dtype, type(self)):

86 if fill_value is None:

87 fill_value = dtype.fill_value

88 dtype = dtype.subtype

90 dtype = pandas_dtype(dtype)

91 if is_string_dtype(dtype):

92 dtype = np.dtype("object")

94 if fill_value is None:

95 fill_value = na_value_for_dtype(dtype)

97 self._dtype = dtype

98 self._fill_value = fill_value

99 self._check_fill_value()

100

101 def __hash__(self) -> int:

102 # Python3 doesn't inherit __hash__ when a base class overrides

103 # __eq__, so we explicitly do it here.

104 return super().__hash__()

105

106 def __eq__(self, other: Any) -> bool:

107 # We have to override __eq__ to handle NA values in _metadata.

108 # The base class does simple == checks, which fail for NA.

109 if isinstance(other, str):

110 try:

111 other = self.construct_from_string(other)

112 except TypeError:

113 return False

114

115 if isinstance(other, type(self)):

116 subtype = self.subtype == other.subtype

117 if self._is_na_fill_value:

118 # this case is complicated by two things:

119 # SparseDtype(float, float(nan)) == SparseDtype(float, np.nan)

120 # SparseDtype(float, np.nan) != SparseDtype(float, pd.NaT)

121 # i.e. we want to treat any floating-point NaN as equal, but

122 # not a floating-point NaN and a datetime NaT.

123 fill_value = (

124 other._is_na_fill_value

125 and isinstance(self.fill_value, type(other.fill_value))

126 or isinstance(other.fill_value, type(self.fill_value))

127 )

128 else:

129 with warnings.catch_warnings():

130 # Ignore spurious numpy warning

131 warnings.filterwarnings(

132 "ignore",

133 "elementwise comparison failed",

134 category=DeprecationWarning,

135 )

136

137 fill_value = self.fill_value == other.fill_value

138

139 return subtype and fill_value

140 return False

141

142 @property

143 def fill_value(self):

144 """

145 The fill value of the array.

146

147 Converting the SparseArray to a dense ndarray will fill the

148 array with this value.

149

150 .. warning::

151

152 It's possible to end up with a SparseArray that has ``fill_value``

153 values in ``sp_values``. This can occur, for example, when setting

154 ``SparseArray.fill_value`` directly.

155 """

156 return self._fill_value

157

158 def _check_fill_value(self):

159 if not is_scalar(self._fill_value):

160 raise ValueError(

161 f"fill_value must be a scalar. Got {self._fill_value} instead"

162 )

163 # TODO: Right now we can use Sparse boolean array

164 # with any fill_value. Here was an attempt

165 # to allow only 3 value: True, False or nan

166 # but plenty test has failed.

167 # see pull 44955

168 # if self._is_boolean and not (

169 # is_bool(self._fill_value) or isna(self._fill_value)

170 # ):

171 # raise ValueError(

172 # "fill_value must be True, False or nan "

173 # f"for boolean type. Got {self._fill_value} instead"

174 # )

175

176 @property

177 def _is_na_fill_value(self) -> bool:

178 return isna(self.fill_value)

179

180 @property

181 def _is_numeric(self) -> bool:

182 return not is_object_dtype(self.subtype)

183

184 @property

185 def _is_boolean(self) -> bool:

186 return is_bool_dtype(self.subtype)

187

188 @property

189 def kind(self) -> str:

190 """

191 The sparse kind. Either 'integer', or 'block'.

192 """

193 return self.subtype.kind

194

195 @property

196 def type(self):

197 return self.subtype.type

198

199 @property

200 def subtype(self):

201 return self._dtype

202

203 @property

204 def name(self) -> str:

205 return f"Sparse[{self.subtype.name}, {repr(self.fill_value)}]"

206

207 def __repr__(self) -> str:

208 return self.name

209

210 @classmethod

211 def construct_array_type(cls) -> type_t[SparseArray]:

212 """

213 Return the array type associated with this dtype.

214

215 Returns

216 -------

217 type

218 """

219 from pandas.core.arrays.sparse.array import SparseArray

220

221 return SparseArray

222

223 @classmethod

224 def construct_from_string(cls, string: str) -> SparseDtype:

225 """

226 Construct a SparseDtype from a string form.

227

228 Parameters

229 ----------

230 string : str

231 Can take the following forms.

232

233 string dtype

234 ================ ============================

235 'int' SparseDtype[np.int64, 0]

236 'Sparse' SparseDtype[np.float64, nan]

237 'Sparse[int]' SparseDtype[np.int64, 0]

238 'Sparse[int, 0]' SparseDtype[np.int64, 0]

239 ================ ============================

240

241 It is not possible to specify non-default fill values

242 with a string. An argument like ``'Sparse[int, 1]'``

243 will raise a ``TypeError`` because the default fill value

244 for integers is 0.

245

246 Returns

247 -------

248 SparseDtype

249 """

250 if not isinstance(string, str):

251 raise TypeError(

252 f"'construct_from_string' expects a string, got {type(string)}"

253 )

254 msg = f"Cannot construct a 'SparseDtype' from '{string}'"

255 if string.startswith("Sparse"):

256 try:

257 sub_type, has_fill_value = cls._parse_subtype(string)

258 except ValueError as err:

259 raise TypeError(msg) from err

260 else:

261 result = SparseDtype(sub_type)

262 msg = (

263 f"Cannot construct a 'SparseDtype' from '{string}'.\n\nIt "

264 "looks like the fill_value in the string is not "

265 "the default for the dtype. Non-default fill_values "

266 "are not supported. Use the 'SparseDtype()' "

267 "constructor instead."

268 )

269 if has_fill_value and str(result) != string:

270 raise TypeError(msg)

271 return result

272 else:

273 raise TypeError(msg)

274

275 @staticmethod

276 def _parse_subtype(dtype: str) -> tuple[str, bool]:

277 """

278 Parse a string to get the subtype

279

280 Parameters

281 ----------

282 dtype : str

283 A string like

284

285 * Sparse[subtype]

286 * Sparse[subtype, fill_value]

287

288 Returns

289 -------

290 subtype : str

291

292 Raises

293 ------

294 ValueError

295 When the subtype cannot be extracted.

296 """

297 xpr = re.compile(r"Sparse\[(?P<subtype>[^,]*)(, )?(?P<fill_value>.*?)?\]$")

298 m = xpr.match(dtype)

299 has_fill_value = False

300 if m:

301 subtype = m.groupdict()["subtype"]

302 has_fill_value = bool(m.groupdict()["fill_value"])

303 elif dtype == "Sparse":

304 subtype = "float64"

305 else:

306 raise ValueError(f"Cannot parse {dtype}")

307 return subtype, has_fill_value

308

309 @classmethod

310 def is_dtype(cls, dtype: object) -> bool:

311 dtype = getattr(dtype, "dtype", dtype)

312 if isinstance(dtype, str) and dtype.startswith("Sparse"):

313 sub_type, _ = cls._parse_subtype(dtype)

314 dtype = np.dtype(sub_type)

315 elif isinstance(dtype, cls):

316 return True

317 return isinstance(dtype, np.dtype) or dtype == "Sparse"

318

319 def update_dtype(self, dtype) -> SparseDtype:

320 """

321 Convert the SparseDtype to a new dtype.

322

323 This takes care of converting the ``fill_value``.

324

325 Parameters

326 ----------

327 dtype : Union[str, numpy.dtype, SparseDtype]

328 The new dtype to use.

329

330 * For a SparseDtype, it is simply returned

331 * For a NumPy dtype (or str), the current fill value

332 is converted to the new dtype, and a SparseDtype

333 with `dtype` and the new fill value is returned.

334

335 Returns

336 -------

337 SparseDtype

338 A new SparseDtype with the correct `dtype` and fill value

339 for that `dtype`.

340

341 Raises

342 ------

343 ValueError

344 When the current fill value cannot be converted to the

345 new `dtype` (e.g. trying to convert ``np.nan`` to an

346 integer dtype).

347

348

349 Examples

350 --------

351 >>> SparseDtype(int, 0).update_dtype(float)

352 Sparse[float64, 0.0]

353

354 >>> SparseDtype(int, 1).update_dtype(SparseDtype(float, np.nan))

355 Sparse[float64, nan]

356 """

357 cls = type(self)

358 dtype = pandas_dtype(dtype)

359

360 if not isinstance(dtype, cls):

361 if not isinstance(dtype, np.dtype):

362 raise TypeError("sparse arrays of extension dtypes not supported")

363

364 fv_asarray = np.atleast_1d(np.array(self.fill_value))

365 fvarr = astype_array(fv_asarray, dtype)

366 # NB: not fv_0d.item(), as that casts dt64->int

367 fill_value = fvarr[0]

368 dtype = cls(dtype, fill_value=fill_value)

369

370 return dtype

371

372 @property

373 def _subtype_with_str(self):

374 """

375 Whether the SparseDtype's subtype should be considered ``str``.

376

377 Typically, pandas will store string data in an object-dtype array.

378 When converting values to a dtype, e.g. in ``.astype``, we need to

379 be more specific, we need the actual underlying type.

380

381 Returns

382 -------

383 >>> SparseDtype(int, 1)._subtype_with_str

384 dtype('int64')

385

386 >>> SparseDtype(object, 1)._subtype_with_str

387 dtype('O')

388

389 >>> dtype = SparseDtype(str, '')

390 >>> dtype.subtype

391 dtype('O')

392

393 >>> dtype._subtype_with_str

394 <class 'str'>

395 """

396 if isinstance(self.fill_value, str):

397 return type(self.fill_value)

398 return self.subtype

399

400 def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:

401 # TODO for now only handle SparseDtypes and numpy dtypes => extend

402 # with other compatible extension dtypes

403 from pandas.core.dtypes.cast import np_find_common_type

404

405 if any(

406 isinstance(x, ExtensionDtype) and not isinstance(x, SparseDtype)

407 for x in dtypes

408 ):

409 return None

410

411 fill_values = [x.fill_value for x in dtypes if isinstance(x, SparseDtype)]

412 fill_value = fill_values[0]

413

414 # np.nan isn't a singleton, so we may end up with multiple

415 # NaNs here, so we ignore the all NA case too.

416 if not (len(set(fill_values)) == 1 or isna(fill_values).all()):

417 warnings.warn(

418 "Concatenating sparse arrays with multiple fill "

419 f"values: '{fill_values}'. Picking the first and "

420 "converting the rest.",

421 PerformanceWarning,

422 stacklevel=find_stack_level(),

423 )

424

425 np_dtypes = (x.subtype if isinstance(x, SparseDtype) else x for x in dtypes)

426 return SparseDtype(np_find_common_type(*np_dtypes), fill_value=fill_value)