1from __future__ import annotations
2
3import numbers
4from typing import (
5 TYPE_CHECKING,
6 Any,
7 Callable,
8 Mapping,
9 TypeVar,
10)
11
12import numpy as np
13
14from pandas._libs import (
15 lib,
16 missing as libmissing,
17)
18from pandas._typing import (
19 Dtype,
20 DtypeObj,
21 npt,
22)
23from pandas.errors import AbstractMethodError
24from pandas.util._decorators import cache_readonly
25
26from pandas.core.dtypes.common import (
27 is_bool_dtype,
28 is_float_dtype,
29 is_integer_dtype,
30 is_object_dtype,
31 is_string_dtype,
32 pandas_dtype,
33)
34
35from pandas.core.arrays.masked import (
36 BaseMaskedArray,
37 BaseMaskedDtype,
38)
39
40if TYPE_CHECKING:
41 import pyarrow
42
43
44T = TypeVar("T", bound="NumericArray")
45
46
class NumericDtype(BaseMaskedDtype):
    """
    Shared base for the masked integer and floating extension dtypes.

    Concrete subclasses are expected to provide ``_default_np_dtype`` and
    ``_checker`` and to implement ``_str_to_dtype_mapping`` and
    ``_safe_cast``, both of which raise ``AbstractMethodError`` here.
    """

    _default_np_dtype: np.dtype
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        """Whether ``self.kind`` is the signed-integer kind code ``"i"``."""
        kind_code = self.kind
        return kind_code == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        """Whether ``self.kind`` is the unsigned-integer kind code ``"u"``."""
        kind_code = self.kind
        return kind_code == "u"

    @property
    def _is_numeric(self) -> bool:
        """Always True for this dtype family."""
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.

        Parameters
        ----------
        array : pyarrow.Array or pyarrow.ChunkedArray
            Input whose type either equals the pyarrow equivalent of this
            dtype or round-trips to an integer/unsigned/float dtype, in
            which case it is cast first.

        Returns
        -------
        BaseMaskedArray
            One masked array covering every chunk of the input.

        Raises
        ------
        TypeError
            If the pyarrow type is neither the expected type nor maps to a
            dtype of kind "i", "u" or "f".
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        masked_cls = self.construct_array_type()
        expected_type = pyarrow.from_numpy_dtype(self.type)

        if not array.type.equals(expected_type):
            # test_from_arrow_type_error raise for string, but allow
            #  through itemsize conversion GH#31896
            roundtrip_dtype = pandas_dtype(array.type.to_pandas_dtype())
            if roundtrip_dtype.kind not in ("i", "u", "f"):
                # Could allow "c" or potentially disallow float<->int
                #  conversion, but at the moment we specifically test that
                #  uint<->int works
                raise TypeError(
                    f"Expected array of {self} type, got {array.type} instead"
                )

            array = array.cast(expected_type)

        # A plain Array is handled as a single chunk; anything else is
        # treated as a pyarrow.ChunkedArray.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        converted = []
        for chunk in chunks:
            data, helper_mask = pyarrow_array_to_numpy_and_mask(
                chunk, dtype=self.numpy_dtype
            )
            # The helper's mask is inverted before being used as the NA mask
            # (presumably it marks valid entries -- confirm against helper).
            converted.append(masked_cls(data.copy(), ~helper_mask, copy=False))

        if len(converted) == 1:
            # avoid additional copy in _concat_same_type
            return converted[0]
        if converted:
            return masked_cls._concat_same_type(converted)

        # No chunks at all: hand back an empty array of the right dtype.
        return masked_cls(
            np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
        )

    @classmethod
    def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]:
        """
        Return the mapping from numpy dtype strings to dtype instances.

        Must be implemented by subclasses.
        """
        raise AbstractMethodError(cls)

    @classmethod
    def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
        """
        Convert a string representation or a numpy dtype to NumericDtype.

        A ``NumericDtype`` instance is returned unchanged.

        Raises
        ------
        ValueError
            If the numpy dtype string is absent from
            ``cls._str_to_dtype_mapping()``.
        """
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt", "Float")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if isinstance(dtype, NumericDtype):
            return dtype

        lookup = cls._str_to_dtype_mapping()
        try:
            return lookup[str(np.dtype(dtype))]
        except KeyError as err:
            raise ValueError(f"invalid dtype specified {dtype}") from err

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.

        Must be implemented by subclasses.
        """
        raise AbstractMethodError(cls)
146
147
def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
    """
    Coerce array-like input into a 1D data ndarray plus a boolean NA mask.

    Parameters
    ----------
    values : array-like
        Input data. An instance of ``dtype_cls.construct_array_type()`` is
        unpacked straight into its ``_data`` and ``_mask``.
    mask : np.ndarray or None
        NA mask for ``values``. Derived from ``values`` when None. Replaced
        by the array's own mask when ``values`` is already a masked array of
        the target class.
    dtype : NumericDtype, str, np.dtype or None
        Requested dtype. When None and ``values`` has a ``dtype`` accepted by
        ``dtype_cls._checker``, that dtype is used.
    copy : bool
        Whether the input data must be copied.
    dtype_cls : type[NumericDtype]
        Dtype class supplying ``_checker``, ``_standardize_dtype``,
        ``construct_array_type`` and ``_safe_cast``.
    default_dtype : np.dtype
        Numpy dtype used when no dtype was requested or picked up from
        ``values``; also the target when converting boolean input.

    Returns
    -------
    tuple
        ``(values, mask, dtype, inferred_type)``. For masked-array input
        ``dtype`` is the standardized dtype (or None); otherwise it is
        ``default_dtype`` or the ``type`` of the standardized dtype.
        ``inferred_type`` is the ``lib.infer_dtype`` result when inference
        ran, else None.

    Raises
    ------
    TypeError
        If ``values`` cannot be converted (e.g. boolean objects without an
        explicit dtype, or a non-numeric dtype), or if ``values`` or ``mask``
        is not one-dimensional.

    Notes
    -----
    Errors raised by ``dtype_cls._standardize_dtype`` and
    ``dtype_cls._safe_cast`` propagate unchanged.
    """
    checker = dtype_cls._checker

    inferred_type = None

    if dtype is None and hasattr(values, "dtype"):
        if checker(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        dtype = dtype_cls._standardize_dtype(dtype)

    cls = dtype_cls.construct_array_type()
    if isinstance(values, cls):
        # Already a masked array of the target class: reuse its buffers.
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask, dtype, inferred_type

    original = values
    if copy:
        values = np.array(values, copy=True)
    else:
        # np.asarray copies only when required on every NumPy version,
        # whereas np.array(..., copy=False) raises under NumPy >= 2 whenever
        # a copy cannot be avoided (e.g. list input).
        values = np.asarray(values)

    if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "boolean" and dtype is None:
            name = dtype_cls.__name__.strip("_")
            raise TypeError(f"{values.dtype} cannot be converted to {name}")

    elif is_bool_dtype(values) and checker(dtype):
        # bool -> numeric always needs a new buffer, so the no-copy branch
        # must tolerate a copy (see note on np.asarray above).
        if copy:
            values = np.array(values, dtype=default_dtype, copy=True)
        else:
            values = np.asarray(values, dtype=default_dtype)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        name = dtype_cls.__name__.strip("_")
        raise TypeError(f"{values.dtype} cannot be converted to {name}")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        if is_integer_dtype(values):
            # fastpath
            mask = np.zeros(len(values), dtype=np.bool_)
        else:
            mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = default_dtype
    else:
        dtype = dtype.type

    if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
        if mask.all():
            # Everything is NA: the data values are placeholders only.
            values = np.ones(values.shape, dtype=dtype)
        else:
            idx = np.nanargmax(values)
            if int(values[idx]) != original[idx]:
                # We have ints that lost precision during the cast.
                inferred_type = lib.infer_dtype(original, skipna=True)
                if (
                    inferred_type not in ["floating", "mixed-integer-float"]
                    and not mask.any()
                ):
                    values = np.asarray(original, dtype=dtype)
                else:
                    values = np.asarray(original, dtype="object")

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = cls._internal_fill_value
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into a float
        values = values.astype(dtype, copy=copy)
    else:
        values = dtype_cls._safe_cast(values, dtype, copy=False)

    return values, mask, dtype, inferred_type
236
237
class NumericArray(BaseMaskedArray):
    """
    Base class for IntegerArray and FloatingArray.

    Subclasses set ``_dtype_cls`` to the dtype class whose ``_checker``,
    ``_default_np_dtype`` and ``_str_to_dtype_mapping`` drive validation,
    coercion and dtype lookup.
    """

    _dtype_cls: type[NumericDtype]

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        """
        Parameters
        ----------
        values : np.ndarray
            Data whose dtype must be accepted by ``_dtype_cls._checker``.
        mask : np.ndarray[bool]
            NA mask, forwarded to the parent constructor.
        copy : bool, default False
            Forwarded to the parent constructor.

        Raises
        ------
        TypeError
            If ``values`` is not an ndarray of an accepted dtype, or has
            dtype ``np.float16``.
        """
        dtype_cls = self._dtype_cls
        checker = dtype_cls._checker
        if not (isinstance(values, np.ndarray) and checker(values.dtype)):
            # Read the kind from the class-level default numpy dtype.
            # ``dtype_cls.kind`` is looked up on the class (not an instance)
            # here and so does not reliably compare equal to "f", which made
            # floating arrays report "integer" in this message.
            default_np_dtype = getattr(dtype_cls, "_default_np_dtype", None)
            is_floating = getattr(default_np_dtype, "kind", None) == "f"
            descr = "floating" if is_floating else "integer"
            raise TypeError(
                f"values should be {descr} numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.dtype == np.float16:
            # If we don't raise here, then accessing self.dtype would raise
            raise TypeError("FloatingArray does not support np.float16 dtype.")

        super().__init__(values, mask, copy=copy)

    @cache_readonly
    def dtype(self) -> NumericDtype:
        """Dtype instance looked up by the string form of ``self._data.dtype``."""
        mapping = self._dtype_cls._str_to_dtype_mapping()
        return mapping[str(self._data.dtype)]

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Coerce ``value`` to a ``(values, mask)`` pair of ndarrays.

        Delegates to ``_coerce_to_data_and_mask`` with no pre-existing mask
        and this class's ``_dtype_cls`` / ``_default_np_dtype``.
        """
        dtype_cls = cls._dtype_cls
        default_dtype = dtype_cls._default_np_dtype
        values, mask, _, _ = _coerce_to_data_and_mask(
            value, None, dtype, copy, dtype_cls, default_dtype
        )
        return values, mask

    @classmethod
    def _from_sequence_of_strings(
        cls: type[T], strings, *, dtype: Dtype | None = None, copy: bool = False
    ) -> T:
        """
        Build an array from strings.

        The strings are parsed with ``to_numeric`` (``errors="raise"``,
        ``dtype_backend="numpy_nullable"``) and the result is passed to
        ``cls._from_sequence``.
        """
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)