1from __future__ import annotations
2
3from typing import (
4 TYPE_CHECKING,
5 Literal,
6)
7import warnings
8
9import numpy as np
10
11from pandas._libs import lib
12from pandas.util._exceptions import find_stack_level
13from pandas.util._validators import check_dtype_backend
14
15from pandas.core.dtypes.cast import maybe_downcast_numeric
16from pandas.core.dtypes.common import (
17 ensure_object,
18 is_bool_dtype,
19 is_decimal,
20 is_integer_dtype,
21 is_number,
22 is_numeric_dtype,
23 is_scalar,
24 is_string_dtype,
25 needs_i8_conversion,
26)
27from pandas.core.dtypes.dtypes import ArrowDtype
28from pandas.core.dtypes.generic import (
29 ABCIndex,
30 ABCSeries,
31)
32
33from pandas.core.arrays import BaseMaskedArray
34from pandas.core.arrays.string_ import StringDtype
35
36if TYPE_CHECKING:
37 from pandas._typing import (
38 DateTimeErrorChoices,
39 DtypeBackend,
40 npt,
41 )
42
43
def to_numeric(
    arg,
    errors: DateTimeErrorChoices = "raise",
    downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.

        .. versionchanged:: 2.2

           "ignore" is deprecated. Catch exceptions explicitly instead.

    downcast : str, default None
        Can be 'integer', 'signed', 'unsigned', or 'float'.
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    Downcasting of nullable integer and floating dtypes is supported:

    >>> s = pd.Series([1, 2, 3], dtype="Int64")
    >>> pd.to_numeric(s, downcast="integer")
    0    1
    1    2
    2    3
    dtype: Int8
    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
    >>> pd.to_numeric(s, downcast="float")
    0    1.0
    1    2.1
    2    3.0
    dtype: Float32
    """
    # Validate keyword options up front so bad parameters fail fast,
    # before any conversion work is attempted.
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")
    if errors == "ignore":
        # GH#54467
        warnings.warn(
            "errors='ignore' is deprecated and will raise in a future version. "
            "Use to_numeric without passing `errors` and catch exceptions "
            "explicitly instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    check_dtype_backend(dtype_backend)

    # Normalize the input into a 1-D array-like ``values``; these flags
    # remember the original container so it can be rebuilt on return.
    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndex):
        is_index = True
        if needs_i8_conversion(arg.dtype):
            # datetime64/timedelta64-like Index: reinterpret as int64
            values = arg.view("i8")
        else:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        if is_decimal(arg):
            # Decimal scalars are converted directly to float
            return float(arg)
        if is_number(arg):
            # already numeric: nothing to do
            return arg
        # non-numeric scalar (e.g. a string): wrap in a 1-element object
        # array so the array machinery below can parse it
        is_scalars = True
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    # Fallback for errors != "raise": if conversion fails below, the
    # untouched input values are handed back instead.
    orig_values = values

    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
    # save mask to reconstruct the full array after casting
    mask: npt.NDArray[np.bool_] | None = None
    if isinstance(values, BaseMaskedArray):
        mask = values._mask
        values = values._data[~mask]

    values_dtype = getattr(values, "dtype", None)
    if isinstance(values_dtype, ArrowDtype):
        # ArrowDtype input: convert only the non-null values to numpy;
        # isna() records where the gaps were
        mask = values.isna()
        values = values.dropna().to_numpy()
    new_mask: np.ndarray | None = None
    if is_numeric_dtype(values_dtype):
        # already numeric: skip parsing entirely
        pass
    elif lib.is_np_dtype(values_dtype, "mM"):
        # datetime64/timedelta64 ndarray: reinterpret as int64
        values = values.view(np.int64)
    else:
        values = ensure_object(values)
        # only errors="coerce" maps unparseable entries to NaN;
        # "raise"/"ignore" let the exception propagate to the handler below
        coerce_numeric = errors not in ("ignore", "raise")
        try:
            # NOTE: precedence is ``A or (B and C)`` — a masked-nullable
            # result is requested either when a dtype_backend was passed,
            # or for non-pyarrow_numpy StringDtype input
            values, new_mask = lib.maybe_convert_numeric(  # type: ignore[call-overload]
                values,
                set(),
                coerce_numeric=coerce_numeric,
                convert_to_masked_nullable=dtype_backend is not lib.no_default
                or isinstance(values_dtype, StringDtype)
                and not values_dtype.storage == "pyarrow_numpy",
            )
        except (ValueError, TypeError):
            if errors == "raise":
                raise
            # errors="ignore" (or "coerce" hitting a non-parse error):
            # fall back to the original, unconverted values
            values = orig_values

    if new_mask is not None:
        # Remove unnecessary values, is expected later anyway and enables
        # downcasting
        values = values[~new_mask]
    elif (
        # precedence again: ``(A and B) or (C and D)`` — synthesize an
        # all-False mask (no missing values) when a nullable result is
        # wanted but the parser did not produce a mask
        dtype_backend is not lib.no_default
        and new_mask is None
        or isinstance(values_dtype, StringDtype)
        and not values_dtype.storage == "pyarrow_numpy"
    ):
        new_mask = np.zeros(values.shape, dtype=np.bool_)

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values.dtype):
        typecodes: str | None = None

        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
            # unsigned downcast is only valid when all values are non-negative
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            typecodes = np.typecodes["Float"]

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for typecode in typecodes:
                dtype = np.dtype(typecode)
                if dtype.itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_numeric(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
    # masked array
    if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
        if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
            # GH 52588: prefer the parser's mask when the input either had
            # no mask or the shapes line up (new_mask may itself be None here)
            mask = new_mask
        else:
            mask = mask.copy()
        assert isinstance(mask, np.ndarray)
        # scatter the converted values back into a full-length buffer,
        # leaving zeros at masked (missing) positions
        data = np.zeros(mask.shape, dtype=values.dtype)
        data[~mask] = values

        from pandas.core.arrays import (
            ArrowExtensionArray,
            BooleanArray,
            FloatingArray,
            IntegerArray,
        )

        # pick the masked-array class matching the result dtype
        klass: type[IntegerArray | BooleanArray | FloatingArray]
        if is_integer_dtype(data.dtype):
            klass = IntegerArray
        elif is_bool_dtype(data.dtype):
            klass = BooleanArray
        else:
            klass = FloatingArray
        values = klass(data, mask)

        if dtype_backend == "pyarrow" or isinstance(values_dtype, ArrowDtype):
            # round-trip through __arrow_array__ to produce an Arrow-backed result
            values = ArrowExtensionArray(values.__arrow_array__())

    # rebuild the same container type the caller passed in
    if is_series:
        return arg._constructor(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        from pandas import Index

        return Index(values, name=arg.name)
    elif is_scalars:
        # unwrap the 1-element array created for scalar input
        return values[0]
    else:
        return values