1"""
2Extend pandas with custom array types.
3"""
4from __future__ import annotations
5
6from typing import (
7 TYPE_CHECKING,
8 Any,
9 TypeVar,
10 cast,
11 overload,
12)
13
14import numpy as np
15
16from pandas._libs import missing as libmissing
17from pandas._libs.hashtable import object_hash
18from pandas._libs.properties import cache_readonly
19from pandas.errors import AbstractMethodError
20
21from pandas.core.dtypes.generic import (
22 ABCDataFrame,
23 ABCIndex,
24 ABCSeries,
25)
26
if TYPE_CHECKING:
    # Everything in this branch is only imported/defined while a static type
    # checker is running. In the code visible in this module these names are
    # referenced from annotations (and from the string passed to ``cast``).
    from pandas._typing import (
        DtypeObj,
        Self,
        Shape,
        npt,
        type_t,
    )

    from pandas import Index
    from pandas.core.arrays import ExtensionArray

    # TypeVar bound to ExtensionDtype: lets a signature declare that it hands
    # back the same ExtensionDtype (sub)type that it was given.
    ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype")
41
42
class ExtensionDtype:
    """
    Base class for custom data types that are paired with an ExtensionArray.

    See Also
    --------
    extensions.register_extension_dtype: Register an ExtensionType
        with pandas as class decorator.
    extensions.ExtensionArray: Abstract base class for custom 1-D array types.

    Notes
    -----
    Subclasses must implement the following abstract members:

    * type
    * name
    * construct_array_type

    These attributes and methods can be overridden to influence how the dtype
    behaves in pandas operations:

    * _is_numeric
    * _is_boolean
    * _get_common_dtype

    Override the `na_value` attribute to choose the default NA value for the
    type; :attr:`numpy.nan` is returned unless it is overridden.

    ExtensionDtypes are required to be hashable. The ``__eq__`` and
    ``__hash__`` implementations provided here are driven by the ``_metadata``
    class attribute, a tuple holding the names (strings) of the attributes
    that define your data type. For ``PeriodDtype``, for example, that is the
    ``freq`` attribute.

    **If you have a parametrized dtype you should set the ``_metadata``
    class property**.

    Ideally the names listed in ``_metadata`` match the parameters of your
    ``ExtensionDtype.__init__`` (if any). The default implementations will not
    work if any attribute listed in ``_metadata`` does not implement the
    standard ``__eq__`` or ``__hash__``.

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.

    Examples
    --------
    To interoperate with Apache Arrow (pyarrow), a dtype may define a
    ``__from_arrow__`` method. It receives a pyarrow Array or ChunkedArray as
    its only argument and is expected to return the pandas ExtensionArray
    that is appropriate for this dtype and the passed values:

    >>> import pyarrow
    >>> from pandas.api.extensions import ExtensionArray
    >>> class ExtensionDtype:
    ...     def __from_arrow__(
    ...         self,
    ...         array: pyarrow.Array | pyarrow.ChunkedArray
    ...     ) -> ExtensionArray:
    ...         ...
    """

    # Names of the attributes that take part in ``__eq__`` and ``__hash__``.
    _metadata: tuple[str, ...] = ()

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other: object) -> bool:
        """
        Return whether 'other' is considered equal to this dtype.

        With the default implementation, 'other' is equal when

        * it is a string from which ``construct_from_string`` builds a dtype
          that compares equal to `self`, e.g. a string matching 'self.name', or
        * it is an instance of ``type(self)`` and every attribute named in
          ``self._metadata`` is equal between `self` and `other`.

        Parameters
        ----------
        other : Any
            The object to compare against.

        Returns
        -------
        bool
        """
        candidate = other
        if isinstance(candidate, str):
            # A string is first turned into a dtype; one that cannot be
            # parsed by this dtype is simply "not equal".
            try:
                candidate = self.construct_from_string(candidate)
            except TypeError:
                return False

        if not isinstance(candidate, type(self)):
            return False

        for attr in self._metadata:
            if not (getattr(self, attr) == getattr(candidate, attr)):
                return False
        return True

    def __hash__(self) -> int:
        # for python>=3.10, different nan objects have different hashes
        # we need to avoid that and thus use hash function with old behavior
        metadata_values = tuple([getattr(self, attr) for attr in self._metadata])
        return object_hash(metadata_values)

    def __ne__(self, other: object) -> bool:
        is_equal = self.__eq__(other)
        return not is_equal

    @property
    def na_value(self) -> object:
        """
        Default NA value to use for this type.

        Used, for example, by ExtensionArray.take. It should be the
        user-facing "boxed" NA value rather than the physical NA value used
        for storage; e.g. for JSONArray, this is an empty dictionary.
        """
        return np.nan

    @property
    def type(self) -> type_t[Any]:
        """
        The scalar type for the array, e.g. ``int``

        ``ExtensionArray[item]`` is expected to return an instance of
        ``ExtensionDtype.type`` for a scalar ``item``, provided the value is
        valid (not NA). NA values need not be instances of `type`.
        """
        raise AbstractMethodError(self)

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV'), default 'O'

        The code should match the NumPy dtype used when the array is
        converted to an ndarray. That is probably 'O' (object) when the
        extension type cannot be represented as a built-in NumPy type.

        See Also
        --------
        numpy.dtype.kind
        """
        return "O"

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        Will be used for display in, e.g. ``Series.dtype``
        """
        raise AbstractMethodError(self)

    @property
    def names(self) -> list[str] | None:
        """
        Ordered list of field names, or None if there are no fields.

        Exists for compatibility with NumPy arrays and may be removed in the
        future.
        """
        return None

    @classmethod
    def construct_array_type(cls) -> type_t[ExtensionArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise AbstractMethodError(cls)

    def empty(self, shape: Shape) -> ExtensionArray:
        """
        Construct an ExtensionArray of this dtype with the given shape.

        Analogous to numpy.empty.

        Parameters
        ----------
        shape : int or tuple[int]

        Returns
        -------
        ExtensionArray
        """
        return self.construct_array_type()._empty(shape, dtype=self)

    @classmethod
    def construct_from_string(cls, string: str) -> Self:
        r"""
        Construct this type from a string.

        Mostly useful for data types that accept parameters. A period dtype,
        for example, accepts a frequency parameter that can be set as
        ``period[h]`` (where ``h`` means hourly frequency).

        The implementation in the abstract class only accepts the name of
        the type. Subclasses can override this method to accept parameters.

        Parameters
        ----------
        string : str
            The name of the type, for example ``category``.

        Returns
        -------
        ExtensionDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.

        Examples
        --------
        For extension dtypes with arguments the following may be an
        adequate implementation.

        >>> import re
        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
        ...     match = pattern.match(string)
        ...     if match:
        ...         return cls(**match.groupdict())
        ...     else:
        ...         raise TypeError(
        ...             f"Cannot construct a '{cls.__name__}' from '{string}'"
        ...         )
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )

        # ``name`` is looked up on the class, so it has to be a plain string
        # there (e.g. a class attribute). If it is still the ``name`` property
        # of the base class, this default implementation cannot be used.
        assert isinstance(cls.name, str), (cls, type(cls.name))

        if string == cls.name:
            return cls()
        raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check if we match 'dtype'.

        Parameters
        ----------
        dtype : object
            The object to check.

        Returns
        -------
        bool

        Notes
        -----
        The default implementation is True if

        1. ``dtype`` is a string and ``cls.construct_from_string(dtype)``
           succeeds.
        2. ``dtype`` is an object and is an instance of ``cls``
        3. ``dtype`` has a ``dtype`` attribute, and any of the above
           conditions is true for ``dtype.dtype``.
        """
        candidate = getattr(dtype, "dtype", dtype)

        if isinstance(candidate, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)):
            # https://github.com/pandas-dev/pandas/issues/22960
            # avoid passing data to `construct_from_string`. This could
            # cause a FutureWarning from numpy about failing elementwise
            # comparison from, e.g., comparing DataFrame == 'category'.
            return False
        if candidate is None:
            return False
        if isinstance(candidate, cls):
            return True
        if not isinstance(candidate, str):
            return False

        try:
            constructed = cls.construct_from_string(candidate)
        except TypeError:
            return False
        return constructed is not None

    @property
    def _is_numeric(self) -> bool:
        """
        Whether columns with this dtype should be considered numeric.

        ExtensionDtypes are assumed to be non-numeric by default, which
        excludes them from operations that skip non-numeric columns, like
        (groupby) reductions, plotting, etc.
        """
        return False

    @property
    def _is_boolean(self) -> bool:
        """
        Whether this dtype should be considered boolean.

        ExtensionDtypes are assumed to be non-boolean by default.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        return False

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Return the common dtype, if one exists.

        Used in the `find_common_type` implementation, for example to
        determine the resulting dtype of a concat operation.

        Return None when no common dtype exists; this gives the other dtypes
        the chance to determine one. If every dtype in the list returns None,
        the common dtype will be "object" dtype, so this method itself never
        needs to return "object" dtype.

        Parameters
        ----------
        dtypes : list of dtypes
            The dtypes for which to determine a common dtype. This is a list
            of np.dtype or ExtensionDtype instances.

        Returns
        -------
        Common dtype (np.dtype or ExtensionDtype) or None
        """
        # A single distinct entry means the list holds only this dtype.
        return self if len(set(dtypes)) == 1 else None

    @property
    def _can_hold_na(self) -> bool:
        """
        Can arrays of this dtype hold NA values?
        """
        return True

    @property
    def _is_immutable(self) -> bool:
        """
        Return True if arrays with this dtype cannot be modified with
        __setitem__.

        Immutable arrays are expected to raise TypeError on __setitem__ calls.
        """
        return False

    @cache_readonly
    def index_class(self) -> type_t[Index]:
        """
        The Index subclass to return from Index.__new__ when this dtype is
        encountered.
        """
        # Imported inside the method rather than at module level.
        from pandas import Index

        return Index

    @property
    def _supports_2d(self) -> bool:
        """
        Do ExtensionArrays with this dtype support 2D arrays?

        Historically ExtensionArrays were limited to 1D. Authors can return
        True here to signal that their arrays support 2D instances, which can
        improve performance in some cases, particularly operations with
        `axis=1`.

        Arrays that support 2D values should:

            - implement Array.reshape
            - subclass the Dim2CompatTests in tests.extension.base
            - _concat_same_type should support `axis` keyword
            - _reduce and reductions should support `axis` keyword
        """
        return False

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Is transposing an array with this dtype zero-copy?

        Only relevant for cases where _supports_2d is True.
        """
        return False
447
448
class StorageExtensionDtype(ExtensionDtype):
    """
    ExtensionDtype that may be backed by more than one implementation.

    The selected implementation is kept in the ``storage`` attribute. Since
    ``storage`` is listed in ``_metadata``, it takes part in the inherited
    equality and hashing.
    """

    name: str
    _metadata = ("storage",)

    def __init__(self, storage: str | None = None) -> None:
        self.storage = storage

    def __repr__(self) -> str:
        # Unlike ``__str__``, the repr also shows the storage.
        return f"{self.name}[{self.storage}]"

    def __str__(self) -> str:
        return self.name

    def __eq__(self, other: object) -> bool:
        # A string equal to the bare name matches whatever the storage is;
        # everything else is decided by the parent implementation.
        return (isinstance(other, str) and other == self.name) or super().__eq__(
            other
        )

    def __hash__(self) -> int:
        # Defining __eq__ in a class body disables the inherited __hash__,
        # so it has to be restored explicitly.
        return super().__hash__()

    @property
    def na_value(self) -> libmissing.NAType:
        return libmissing.NA
476
477
def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
    """
    Register an ExtensionType with pandas as class decorator.

    This enables operations like ``.astype(name)`` for the name
    of the ExtensionDtype.

    Parameters
    ----------
    cls : type
        The ExtensionDtype subclass to add to the module-level registry.

    Returns
    -------
    type
        The class that was passed in, unchanged, so that the function can be
        applied as a class decorator.

    Examples
    --------
    >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
    >>> @register_extension_dtype
    ... class MyExtensionDtype(ExtensionDtype):
    ...     name = "myextension"
    """
    registry = _registry
    registry.register(cls)
    return cls
499
500
class Registry:
    """
    Registry for dtype inference.

    The registry allows one to map a string repr of an extension
    dtype to an extension dtype. The string alias can be used in several
    places, including

    * Series and Index constructors
    * :meth:`pandas.array`
    * :meth:`pandas.Series.astype`

    Multiple extension types can be registered.
    These are tried in order.
    """

    def __init__(self) -> None:
        # Registered dtype classes, in registration order; ``find`` walks
        # this list front to back when resolving a string.
        self.dtypes: list[type_t[ExtensionDtype]] = []

    def register(self, dtype: type_t[ExtensionDtype]) -> None:
        """
        Add an ExtensionDtype class to the registry.

        Parameters
        ----------
        dtype : ExtensionDtype class

        Raises
        ------
        ValueError
            If ``dtype`` is not a subclass of ExtensionDtype. This includes
            objects that are not classes at all, e.g. a dtype instance.
        """
        # ``issubclass`` raises TypeError when its first argument is not a
        # class, so check that first to report every invalid input with the
        # same ValueError.
        if not (isinstance(dtype, type) and issubclass(dtype, ExtensionDtype)):
            raise ValueError("can only register pandas extension dtypes")

        self.dtypes.append(dtype)

    @overload
    def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
        ...

    @overload
    def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
        ...

    @overload
    def find(self, dtype: str) -> ExtensionDtype | None:
        ...

    @overload
    def find(
        self, dtype: npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        ...

    def find(
        self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        """
        Resolve ``dtype`` to an extension dtype, if possible.

        Parameters
        ----------
        dtype : ExtensionDtype class or instance or str or numpy dtype or python type

        Returns
        -------
        ExtensionDtype class or instance, or None
            * An ExtensionDtype class or instance is returned as is.
            * A string is offered to ``construct_from_string`` of each
              registered dtype in registration order; the first instance
              constructed without a TypeError is returned.
            * None is returned when nothing matches.
        """
        if not isinstance(dtype, str):
            dtype_type: type_t
            if not isinstance(dtype, type):
                dtype_type = type(dtype)
            else:
                dtype_type = dtype
            if issubclass(dtype_type, ExtensionDtype):
                # cast needed here as mypy doesn't know we have figured
                # out it is an ExtensionDtype or type_t[ExtensionDtype]
                return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)

            return None

        for dtype_type in self.dtypes:
            try:
                return dtype_type.construct_from_string(dtype)
            except TypeError:
                # TypeError is how a dtype signals that it does not
                # recognize the string; move on to the next candidate.
                pass

        return None
581
582
# Module-level Registry instance; ``register_extension_dtype`` adds dtype
# classes to this object.
_registry = Registry()