Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/dtypes/base.py: 65%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

155 statements  

1""" 

2Extend pandas with custom array types. 

3""" 

4from __future__ import annotations 

5 

6from typing import ( 

7 TYPE_CHECKING, 

8 Any, 

9 TypeVar, 

10 cast, 

11 overload, 

12) 

13 

14import numpy as np 

15 

16from pandas._libs import missing as libmissing 

17from pandas._libs.hashtable import object_hash 

18from pandas._libs.properties import cache_readonly 

19from pandas.errors import AbstractMethodError 

20 

21from pandas.core.dtypes.generic import ( 

22 ABCDataFrame, 

23 ABCIndex, 

24 ABCSeries, 

25) 

26 

27if TYPE_CHECKING: 

28 from pandas._typing import ( 

29 DtypeObj, 

30 Self, 

31 Shape, 

32 npt, 

33 type_t, 

34 ) 

35 

36 from pandas import Index 

37 from pandas.core.arrays import ExtensionArray 

38 

39 # To parameterize on same ExtensionDtype 

40 ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype") 

41 

42 

class ExtensionDtype:
    """
    Base class for a custom data type that is paired with an ExtensionArray.

    See Also
    --------
    extensions.register_extension_dtype: Register an ExtensionType
        with pandas as class decorator.
    extensions.ExtensionArray: Abstract base class for custom 1-D array types.

    Notes
    -----
    Subclasses must implement the following abstract members of the
    interface:

    * type
    * name
    * construct_array_type

    The following attributes and methods can be overridden to influence how
    pandas operations treat the dtype:

    * _is_numeric
    * _is_boolean
    * _get_common_dtype

    Override the `na_value` attribute to choose the default NA value for
    this type. :attr:`numpy.nan` is used unless it is overridden.

    ExtensionDtypes are required to be hashable. The default ``__eq__`` and
    ``__hash__`` provided here are driven by the ``_metadata`` class
    attribute, which should be a tuple of the attribute names (strings)
    that define your data type. For ``PeriodDtype``, for example, that is
    the ``freq`` attribute.

    **A parametrized dtype should always set the ``_metadata`` class
    attribute**.

    Ideally, the names listed in ``_metadata`` match the parameters of your
    ``ExtensionDtype.__init__`` (if any). The default implementations here
    will not work if any attribute named in ``_metadata`` does not implement
    the standard ``__eq__`` or ``__hash__``.

    Examples
    --------

    To interoperate with Apache Arrow (pyarrow), a dtype can implement a
    ``__from_arrow__`` method. It receives a pyarrow Array or ChunkedArray
    as its only argument and is expected to return the pandas ExtensionArray
    appropriate for this dtype and the passed values:

    >>> import pyarrow
    >>> from pandas.api.extensions import ExtensionArray
    >>> class ExtensionDtype:
    ...     def __from_arrow__(
    ...         self,
    ...         array: pyarrow.Array | pyarrow.ChunkedArray
    ...     ) -> ExtensionArray:
    ...         ...

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.
    """

    # Names of the attributes that define a parametrized dtype; they drive
    # the default __eq__ and __hash__ below.
    _metadata: tuple[str, ...] = ()

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other: object) -> bool:
        """
        Decide whether 'other' is equal to this dtype.

        By default, 'other' compares equal when either

        * it is a string from which ``construct_from_string`` can build a
          dtype that satisfies the next condition (for the default
          implementation: a string matching 'self.name'), or
        * it is an instance of this type and every attribute named in
          ``self._metadata`` is equal between `self` and `other`.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        candidate = other
        if isinstance(candidate, str):
            try:
                candidate = self.construct_from_string(candidate)
            except TypeError:
                # The string does not describe this dtype.
                return False
        if not isinstance(candidate, type(self)):
            return False
        for field in self._metadata:
            if not (getattr(self, field) == getattr(candidate, field)):
                return False
        return True

    def __hash__(self) -> int:
        # On python>=3.10 distinct nan objects hash differently; object_hash
        # keeps the old behavior so equal dtypes still hash alike.
        key = tuple([getattr(self, field) for field in self._metadata])
        return object_hash(key)

    def __ne__(self, other: object) -> bool:
        is_equal = self.__eq__(other)
        return not is_equal

    @property
    def na_value(self) -> object:
        """
        Default NA value for this type.

        Used in, e.g., ExtensionArray.take. It should be the user-facing
        "boxed" NA value rather than the physical NA value used for storage;
        for JSONArray, for instance, this is an empty dictionary.
        """
        return np.nan

    @property
    def type(self) -> type_t[Any]:
        """
        The scalar type for the array, e.g. ``int``

        ``ExtensionArray[item]`` is expected to return an instance of
        ``ExtensionDtype.type`` for a scalar ``item``, provided the value is
        valid (not NA). NA values do not need to be instances of `type`.
        """
        raise AbstractMethodError(self)

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV'), default 'O'

        It should match the NumPy dtype used when the array is converted to
        an ndarray, which is probably 'O' for object if the extension type
        cannot be represented as a built-in NumPy type.

        See Also
        --------
        numpy.dtype.kind
        """
        return "O"

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        It is used for display in, e.g., ``Series.dtype``
        """
        raise AbstractMethodError(self)

    @property
    def names(self) -> list[str] | None:
        """
        Ordered list of field names, or None if there are no fields.

        Provided for compatibility with NumPy arrays; it may be removed in
        the future.
        """
        return None

    @classmethod
    def construct_array_type(cls) -> type_t[ExtensionArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise AbstractMethodError(cls)

    def empty(self, shape: Shape) -> ExtensionArray:
        """
        Build an ExtensionArray of this dtype with the given shape.

        Analogous to numpy.empty.

        Parameters
        ----------
        shape : int or tuple[int]

        Returns
        -------
        ExtensionArray
        """
        return self.construct_array_type()._empty(shape, dtype=self)

    @classmethod
    def construct_from_string(cls, string: str) -> Self:
        r"""
        Construct this type from a string.

        This is mainly useful for data types that accept parameters.
        For example, a period dtype accepts a frequency parameter that
        can be set as ``period[h]`` (where ``h`` means hourly frequency).

        The default implementation in this abstract class only accepts the
        name of the type. Subclasses can override this method to accept
        parameters.

        Parameters
        ----------
        string : str
            The name of the type, for example ``category``.

        Returns
        -------
        ExtensionDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.

        Examples
        --------
        For extension dtypes with arguments the following may be an
        adequate implementation.

        >>> import re
        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
        ...     match = pattern.match(string)
        ...     if match:
        ...         return cls(**match.groupdict())
        ...     else:
        ...         raise TypeError(
        ...             f"Cannot construct a '{cls.__name__}' from '{string}'"
        ...         )
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        # ``name`` is a property on this base class, so a type checker sees a
        # callable on the class; the assert documents that a concrete dtype
        # using this default must expose a plain string at class level.
        expected_name = cls.name
        assert isinstance(expected_name, str), (cls, type(expected_name))
        if string != expected_name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check if we match 'dtype'.

        Parameters
        ----------
        dtype : object
            The object to check.

        Returns
        -------
        bool

        Notes
        -----
        The default implementation is True if

        1. ``cls.construct_from_string(dtype)`` is an instance
           of ``cls``.
        2. ``dtype`` is an object and is an instance of ``cls``
        3. ``dtype`` has a ``dtype`` attribute, and any of the above
           conditions is true for ``dtype.dtype``.
        """
        target = getattr(dtype, "dtype", dtype)

        # https://github.com/pandas-dev/pandas/issues/22960
        # Never hand data containers (or numpy dtypes) to
        # `construct_from_string`: comparing, e.g., DataFrame == 'category'
        # could trigger a FutureWarning from numpy about failing elementwise
        # comparison.
        if target is None or isinstance(
            target, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)
        ):
            return False
        if isinstance(target, cls):
            return True
        if not isinstance(target, str):
            return False
        try:
            constructed = cls.construct_from_string(target)
        except TypeError:
            return False
        return constructed is not None

    @property
    def _is_numeric(self) -> bool:
        """
        Whether columns with this dtype should be considered numeric.

        ExtensionDtypes are assumed to be non-numeric by default, so they
        are left out of operations that exclude non-numeric columns, like
        (groupby) reductions, plotting, etc.
        """
        return False

    @property
    def _is_boolean(self) -> bool:
        """
        Whether this dtype should be considered boolean.

        By default, ExtensionDtypes are assumed to be non-boolean.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        return False

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Return the common dtype, if one exists.

        Used in the `find_common_type` implementation, for example to
        determine the resulting dtype of a concat operation.

        Return None when no common dtype exists, which gives the other
        dtypes the chance to determine one. If every dtype in the list
        returns None, the common dtype will be "object" dtype, so this
        method never needs to return "object" dtype itself.

        Parameters
        ----------
        dtypes : list of dtypes
            The dtypes for which to determine a common dtype. This is a list
            of np.dtype or ExtensionDtype instances.

        Returns
        -------
        Common dtype (np.dtype or ExtensionDtype) or None
        """
        # A single distinct entry means the list holds only this dtype.
        return self if len(set(dtypes)) == 1 else None

    @property
    def _can_hold_na(self) -> bool:
        """
        Can arrays of this dtype hold NA values?
        """
        return True

    @property
    def _is_immutable(self) -> bool:
        """
        Return True if arrays with this dtype cannot be modified with
        __setitem__.

        Immutable arrays are expected to raise TypeError on __setitem__ calls.
        """
        return False

    @cache_readonly
    def index_class(self) -> type_t[Index]:
        """
        The Index subclass to return from Index.__new__ when this dtype is
        encountered.
        """
        # Imported inside the method rather than at module top; at module
        # level Index is only imported under TYPE_CHECKING.
        from pandas import Index

        return Index

    @property
    def _supports_2d(self) -> bool:
        """
        Do ExtensionArrays with this dtype support 2D arrays?

        Historically ExtensionArrays were limited to 1D. Returning True here
        lets authors signal that their arrays support 2D instances, which
        can improve performance in some cases, particularly operations with
        `axis=1`.

        Arrays that support 2D values should:

            - implement Array.reshape
            - subclass the Dim2CompatTests in tests.extension.base
            - _concat_same_type should support `axis` keyword
            - _reduce and reductions should support `axis` keyword
        """
        return False

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Is transposing an array with this dtype zero-copy?

        Only relevant for cases where _supports_2d is True.
        """
        return False

447 

448 

class StorageExtensionDtype(ExtensionDtype):
    """Extension dtype whose arrays may be backed by more than one implementation."""

    name: str
    # The storage backend is the only parameter that defines the dtype, so
    # it is what the inherited equality and hashing compare.
    _metadata = ("storage",)

    def __init__(self, storage: str | None = None) -> None:
        self.storage = storage

    def __repr__(self) -> str:
        return f"{self.name}[{self.storage}]"

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other: object) -> bool:
        # The bare dtype name matches whatever the storage is; anything
        # else is delegated to the metadata-based comparison of the parent.
        matches_name = isinstance(other, str) and other == self.name
        if not matches_name:
            return super().__eq__(other)
        return True

    def __hash__(self) -> int:
        # Defining __eq__ above means __hash__ has to be defined explicitly
        # as well; reuse the parent's implementation.
        return super().__hash__()

    @property
    def na_value(self) -> libmissing.NAType:
        return libmissing.NA

476 

477 

def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
    """
    Register an ExtensionType with pandas as class decorator.

    This enables operations like ``.astype(name)`` for the name
    of the ExtensionDtype.

    Parameters
    ----------
    cls : type
        The ExtensionDtype subclass to register.

    Returns
    -------
    type
        ``cls`` itself, unchanged, so that the function can be applied
        directly as a class decorator.

    Raises
    ------
    ValueError
        If ``cls`` is not a subclass of ExtensionDtype.

    Examples
    --------
    >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
    >>> @register_extension_dtype
    ... class MyExtensionDtype(ExtensionDtype):
    ...     name = "myextension"
    """
    # Add the class to the module-level registry that string lookups search.
    _registry.register(cls)
    return cls

499 

500 

class Registry:
    """
    Registry for dtype inference.

    The registry maps the string repr of an extension dtype to the
    extension dtype itself. The string alias can be used in several
    places, including

    * Series and Index constructors
    * :meth:`pandas.array`
    * :meth:`pandas.Series.astype`

    Multiple extension types can be registered.
    They are tried in the order in which they were registered.
    """

    def __init__(self) -> None:
        self.dtypes: list[type_t[ExtensionDtype]] = []

    def register(self, dtype: type_t[ExtensionDtype]) -> None:
        """
        Append an extension dtype class to the registry.

        Parameters
        ----------
        dtype : ExtensionDtype class

        Raises
        ------
        ValueError
            If ``dtype`` is not a subclass of ExtensionDtype.
        """
        if issubclass(dtype, ExtensionDtype):
            self.dtypes.append(dtype)
            return
        raise ValueError("can only register pandas extension dtypes")

    @overload
    def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
        ...

    @overload
    def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
        ...

    @overload
    def find(self, dtype: str) -> ExtensionDtype | None:
        ...

    @overload
    def find(
        self, dtype: npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        ...

    def find(
        self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        """
        Look up the extension dtype that corresponds to ``dtype``.

        Parameters
        ----------
        dtype : ExtensionDtype class or instance or str or numpy dtype or python type

        Returns
        -------
        return the first matching dtype, otherwise return None
        """
        if isinstance(dtype, str):
            # String alias: the first registered class that can build
            # itself from the string wins.
            for candidate in self.dtypes:
                try:
                    return candidate.construct_from_string(dtype)
                except TypeError:
                    continue
            return None

        # Non-string input: ExtensionDtype classes and instances are handed
        # back unchanged; anything else is not an extension dtype.
        klass: type_t = dtype if isinstance(dtype, type) else type(dtype)
        if issubclass(klass, ExtensionDtype):
            # cast needed here as mypy doesn't know we have figured
            # out it is an ExtensionDtype or type_t[ExtensionDtype]
            return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)
        return None

581 

582 

# Module-level Registry instance; ``register_extension_dtype`` adds dtype
# classes to it.
_registry = Registry()