1"""
2Extend pandas with custom array types.
3"""
4from __future__ import annotations
5
6from typing import (
7 TYPE_CHECKING,
8 Any,
9 TypeVar,
10 cast,
11 overload,
12)
13
14import numpy as np
15
16from pandas._libs import missing as libmissing
17from pandas._libs.hashtable import object_hash
18from pandas._typing import (
19 DtypeObj,
20 Shape,
21 npt,
22 type_t,
23)
24from pandas.errors import AbstractMethodError
25
26from pandas.core.dtypes.generic import (
27 ABCDataFrame,
28 ABCIndex,
29 ABCSeries,
30)
31
32if TYPE_CHECKING:
33 from pandas.core.arrays import ExtensionArray
34
35 # To parameterize on same ExtensionDtype
36 ExtensionDtypeT = TypeVar("ExtensionDtypeT", bound="ExtensionDtype")
37
38
class ExtensionDtype:
    """
    A custom data type, to be paired with an ExtensionArray.

    See Also
    --------
    extensions.register_extension_dtype: Register an ExtensionType
        with pandas as class decorator.
    extensions.ExtensionArray: Abstract base class for custom 1-D array types.

    Notes
    -----
    The interface includes the following abstract methods that must
    be implemented by subclasses:

    * type
    * name
    * construct_array_type

    The following attributes and methods influence the behavior of the dtype in
    pandas operations

    * _is_numeric
    * _is_boolean
    * _get_common_dtype

    The `na_value` class attribute can be used to set the default NA value
    for this type. :attr:`numpy.nan` is used by default.

    ExtensionDtypes are required to be hashable. The base class provides
    a default implementation, which relies on the ``_metadata`` class
    attribute. ``_metadata`` should be a tuple containing the strings
    that define your data type. For example, with ``PeriodDtype`` that's
    the ``freq`` attribute.

    **If you have a parametrized dtype you should set the ``_metadata``
    class property**.

    Ideally, the attributes in ``_metadata`` will match the
    parameters to your ``ExtensionDtype.__init__`` (if any). If any of
    the attributes in ``_metadata`` don't implement the standard
    ``__eq__`` or ``__hash__``, the default implementations here will not
    work.

    For interaction with Apache Arrow (pyarrow), a ``__from_arrow__`` method
    can be implemented: this method receives a pyarrow Array or ChunkedArray
    as only argument and is expected to return the appropriate pandas
    ExtensionArray for this dtype and the passed values::

        class ExtensionDtype:

            def __from_arrow__(
                self, array: Union[pyarrow.Array, pyarrow.ChunkedArray]
            ) -> ExtensionArray:
                ...

    This class does not inherit from 'abc.ABCMeta' for performance reasons.
    Methods and properties required by the interface raise
    ``pandas.errors.AbstractMethodError`` and no ``register`` method is
    provided for registering virtual subclasses.
    """

    # Names of the instance attributes that parametrize the dtype; the
    # default ``__eq__`` and ``__hash__`` are computed from these.
    _metadata: tuple[str, ...] = ()

    def __str__(self) -> str:
        """Return ``self.name``."""
        return self.name

    def __eq__(self, other: Any) -> bool:
        """
        Check whether 'other' is equal to self.

        By default, 'other' is considered equal if either

        * it's a string matching 'self.name'.
        * it's an instance of this type and all of the attributes
          in ``self._metadata`` are equal between `self` and `other`.

        Parameters
        ----------
        other : Any

        Returns
        -------
        bool
        """
        if isinstance(other, str):
            # A string is first converted to a dtype; a string that this
            # dtype cannot be constructed from is simply "not equal".
            try:
                other = self.construct_from_string(other)
            except TypeError:
                return False
        if isinstance(other, type(self)):
            return all(
                getattr(self, attr) == getattr(other, attr) for attr in self._metadata
            )
        return False

    def __hash__(self) -> int:
        """Hash the tuple of attribute values named in ``_metadata``."""
        # for python>=3.10, different nan objects have different hashes
        # we need to avoid that and thus use hash function with old behavior
        return object_hash(tuple(getattr(self, attr) for attr in self._metadata))

    def __ne__(self, other: Any) -> bool:
        """Return the negation of ``self.__eq__(other)``."""
        return not self.__eq__(other)

    @property
    def na_value(self) -> object:
        """
        Default NA value to use for this type.

        This is used in e.g. ExtensionArray.take. This should be the
        user-facing "boxed" version of the NA value, not the physical NA value
        for storage.  e.g. for JSONArray, this is an empty dictionary.
        """
        return np.nan

    @property
    def type(self) -> type_t[Any]:
        """
        The scalar type for the array, e.g. ``int``

        It's expected ``ExtensionArray[item]`` returns an instance
        of ``ExtensionDtype.type`` for scalar ``item``, assuming
        that value is valid (not NA). NA values do not need to be
        instances of `type`.
        """
        raise AbstractMethodError(self)

    @property
    def kind(self) -> str:
        """
        A character code (one of 'biufcmMOSUV'), default 'O'

        This should match the NumPy dtype used when the array is
        converted to an ndarray, which is probably 'O' for object if
        the extension type cannot be represented as a built-in NumPy
        type.

        See Also
        --------
        numpy.dtype.kind
        """
        return "O"

    @property
    def name(self) -> str:
        """
        A string identifying the data type.

        Will be used for display in, e.g. ``Series.dtype``
        """
        raise AbstractMethodError(self)

    @property
    def names(self) -> list[str] | None:
        """
        Ordered list of field names, or None if there are no fields.

        This is for compatibility with NumPy arrays, and may be removed in the
        future.
        """
        return None

    @classmethod
    def construct_array_type(cls) -> type_t[ExtensionArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        raise AbstractMethodError(cls)

    def empty(self, shape: Shape) -> ExtensionArray:
        """
        Construct an ExtensionArray of this dtype with the given shape.

        Analogous to numpy.empty.

        Parameters
        ----------
        shape : int or tuple[int]

        Returns
        -------
        ExtensionArray
        """
        # Delegate to the paired array class, which knows how to allocate.
        cls = self.construct_array_type()
        return cls._empty(shape, dtype=self)

    @classmethod
    def construct_from_string(
        cls: type_t[ExtensionDtypeT], string: str
    ) -> ExtensionDtypeT:
        r"""
        Construct this type from a string.

        This is useful mainly for data types that accept parameters.
        For example, a period dtype accepts a frequency parameter that
        can be set as ``period[H]`` (where H means hourly frequency).

        By default, in the abstract class, just the name of the type is
        expected. But subclasses can overwrite this method to accept
        parameters.

        Parameters
        ----------
        string : str
            The name of the type, for example ``category``.

        Returns
        -------
        ExtensionDtype
            Instance of the dtype.

        Raises
        ------
        TypeError
            If a class cannot be constructed from this 'string'.

        Examples
        --------
        For extension dtypes with arguments the following may be an
        adequate implementation.

        >>> @classmethod
        ... def construct_from_string(cls, string):
        ...     pattern = re.compile(r"^my_type\[(?P<arg_name>.+)\]$")
        ...     match = pattern.match(string)
        ...     if match:
        ...         return cls(**match.groupdict())
        ...     else:
        ...         raise TypeError(
        ...             f"Cannot construct a '{cls.__name__}' from '{string}'"
        ...         )
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        # On this base class ``name`` is a property, so ``cls.name`` is only a
        # plain string when a subclass overrides it with a class attribute.
        # The assert makes that requirement explicit (and narrows the type
        # for the comparison below); subclasses that compute ``name`` per
        # instance need to override this method.
        assert isinstance(cls.name, str), (cls, type(cls.name))
        if string != cls.name:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")
        return cls()

    @classmethod
    def is_dtype(cls, dtype: object) -> bool:
        """
        Check if we match 'dtype'.

        Parameters
        ----------
        dtype : object
            The object to check.

        Returns
        -------
        bool

        Notes
        -----
        The default implementation is True if

        1. ``cls.construct_from_string(dtype)`` is an instance
           of ``cls``.
        2. ``dtype`` is an object and is an instance of ``cls``
        3. ``dtype`` has a ``dtype`` attribute, and any of the above
           conditions is true for ``dtype.dtype``.
        """
        # Unwrap array-likes: check their ``.dtype`` rather than the container.
        dtype = getattr(dtype, "dtype", dtype)

        if isinstance(dtype, (ABCSeries, ABCIndex, ABCDataFrame, np.dtype)):
            # https://github.com/pandas-dev/pandas/issues/22960
            # avoid passing data to `construct_from_string`. This could
            # cause a FutureWarning from numpy about failing elementwise
            # comparison from, e.g., comparing DataFrame == 'category'.
            return False
        elif dtype is None:
            return False
        elif isinstance(dtype, cls):
            return True
        if isinstance(dtype, str):
            try:
                return cls.construct_from_string(dtype) is not None
            except TypeError:
                return False
        return False

    @property
    def _is_numeric(self) -> bool:
        """
        Whether columns with this dtype should be considered numeric.

        By default ExtensionDtypes are assumed to be non-numeric.
        They'll be excluded from operations that exclude non-numeric
        columns, like (groupby) reductions, plotting, etc.
        """
        return False

    @property
    def _is_boolean(self) -> bool:
        """
        Whether this dtype should be considered boolean.

        By default, ExtensionDtypes are assumed to be non-boolean.
        Setting this to True will affect the behavior of several places,
        e.g.

        * is_bool
        * boolean indexing

        Returns
        -------
        bool
        """
        return False

    def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
        """
        Return the common dtype, if one exists.

        Used in `find_common_type` implementation. This is for example used
        to determine the resulting dtype in a concat operation.

        If no common dtype exists, return None (which gives the other dtypes
        the chance to determine a common dtype). If all dtypes in the list
        return None, then the common dtype will be "object" dtype (this means
        it is never needed to return "object" dtype from this method itself).

        Parameters
        ----------
        dtypes : list of dtypes
            The dtypes for which to determine a common dtype. This is a list
            of np.dtype or ExtensionDtype instances.

        Returns
        -------
        Common dtype (np.dtype or ExtensionDtype) or None
        """
        if len(set(dtypes)) == 1:
            # only itself
            return self
        else:
            return None

    @property
    def _can_hold_na(self) -> bool:
        """
        Can arrays of this dtype hold NA values?
        """
        return True
392
393
class StorageExtensionDtype(ExtensionDtype):
    """
    An ExtensionDtype for which more than one backing implementation may exist.

    The selected implementation is kept in the ``storage`` attribute, which
    is listed in ``_metadata``.
    """

    name: str
    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        self.storage = storage

    def __repr__(self) -> str:
        """Return ``"<name>[<storage>]"``."""
        return "{}[{}]".format(self.name, self.storage)

    def __str__(self) -> str:
        """Return the dtype name without the storage suffix."""
        return self.name

    def __eq__(self, other: Any) -> bool:
        """
        Compare with ``other``.

        A string equal to ``self.name`` matches immediately; every other
        comparison is delegated to the parent class.
        """
        matches_name = isinstance(other, str) and other == self.name
        return True if matches_name else super().__eq__(other)

    def __hash__(self) -> int:
        # Defining __eq__ in a class body resets __hash__, so the parent's
        # implementation has to be restored explicitly.
        return super().__hash__()

    @property
    def na_value(self) -> libmissing.NAType:
        """The missing-value marker for this dtype: ``libmissing.NA``."""
        return libmissing.NA
421
422
def register_extension_dtype(cls: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
    """
    Register an ExtensionType with pandas as class decorator.

    This enables operations like ``.astype(name)`` for the name
    of the ExtensionDtype.

    Parameters
    ----------
    cls : type
        The ExtensionDtype subclass to register.

    Returns
    -------
    type
        ``cls`` itself, unchanged, so that this function can be applied
        directly as a class decorator.

    Examples
    --------
    >>> from pandas.api.extensions import register_extension_dtype, ExtensionDtype
    >>> @register_extension_dtype
    ... class MyExtensionDtype(ExtensionDtype):
    ...     name = "myextension"
    """
    # Add the class to the module-level registry, then hand it back so the
    # decorated name stays bound to the class.
    _registry.register(cls)
    return cls
444
445
class Registry:
    """
    Registry for dtype inference.

    The registry allows one to map a string repr of an extension
    dtype to an extension dtype. The string alias can be used in several
    places, including

    * Series and Index constructors
    * :meth:`pandas.array`
    * :meth:`pandas.Series.astype`

    Multiple extension types can be registered.
    These are tried in order.
    """

    def __init__(self) -> None:
        # Registered dtype classes, in registration order; ``find`` tries
        # them in this order when resolving a string.
        self.dtypes: list[type_t[ExtensionDtype]] = []

    def register(self, dtype: type_t[ExtensionDtype]) -> None:
        """
        Add an ExtensionDtype class to the registry.

        Parameters
        ----------
        dtype : ExtensionDtype class

        Raises
        ------
        ValueError
            If ``dtype`` is not a class deriving from ExtensionDtype.
        """
        # ``issubclass`` raises TypeError when its first argument is not a
        # class (e.g. an instance or a string), so check that first to make
        # sure every invalid input gets the same ValueError.
        if not (isinstance(dtype, type) and issubclass(dtype, ExtensionDtype)):
            raise ValueError("can only register pandas extension dtypes")

        self.dtypes.append(dtype)

    @overload
    def find(self, dtype: type_t[ExtensionDtypeT]) -> type_t[ExtensionDtypeT]:
        ...

    @overload
    def find(self, dtype: ExtensionDtypeT) -> ExtensionDtypeT:
        ...

    @overload
    def find(self, dtype: str) -> ExtensionDtype | None:
        ...

    @overload
    def find(
        self, dtype: npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        ...

    def find(
        self, dtype: type_t[ExtensionDtype] | ExtensionDtype | npt.DTypeLike
    ) -> type_t[ExtensionDtype] | ExtensionDtype | None:
        """
        Look up the extension dtype matching ``dtype``.

        Parameters
        ----------
        dtype : ExtensionDtype class or instance or str or numpy dtype or python type

        Returns
        -------
        ExtensionDtype class, ExtensionDtype instance or None
            An ExtensionDtype class or instance is returned as-is. For a
            string, the result of the first registered dtype whose
            ``construct_from_string`` accepts it. None in every other case.
        """
        if not isinstance(dtype, str):
            # Normalize to a class so a single issubclass check covers both
            # ExtensionDtype classes and ExtensionDtype instances.
            dtype_type: type_t = dtype if isinstance(dtype, type) else type(dtype)
            if issubclass(dtype_type, ExtensionDtype):
                # cast needed here as mypy doesn't know we have figured
                # out it is an ExtensionDtype or type_t[ExtensionDtype]
                return cast("ExtensionDtype | type_t[ExtensionDtype]", dtype)

            return None

        for registered in self.dtypes:
            try:
                return registered.construct_from_string(dtype)
            except TypeError:
                # This dtype does not recognize the string; try the next one.
                pass

        return None
526
527
# Module-level registry instance; ``register_extension_dtype`` adds dtype
# classes to it.
_registry = Registry()