1from __future__ import annotations
2
3from typing import (
4 TYPE_CHECKING,
5 Literal,
6)
7import warnings
8
9import numpy as np
10
11from pandas._libs import lib
12from pandas.util._exceptions import find_stack_level
13from pandas.util._validators import check_dtype_backend
14
15from pandas.core.dtypes.cast import maybe_downcast_numeric
16from pandas.core.dtypes.common import (
17 ensure_object,
18 is_bool_dtype,
19 is_decimal,
20 is_integer_dtype,
21 is_number,
22 is_numeric_dtype,
23 is_scalar,
24 is_string_dtype,
25 needs_i8_conversion,
26)
27from pandas.core.dtypes.dtypes import ArrowDtype
28from pandas.core.dtypes.generic import (
29 ABCIndex,
30 ABCSeries,
31)
32
33from pandas.core.arrays import BaseMaskedArray
34from pandas.core.arrays.string_ import StringDtype
35
36if TYPE_CHECKING:
37 from pandas._typing import (
38 DateTimeErrorChoices,
39 DtypeBackend,
40 npt,
41 )
42
43
def to_numeric(
    arg,
    errors: DateTimeErrorChoices = "raise",
    downcast: Literal["integer", "signed", "unsigned", "float"] | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.

        .. versionchanged:: 2.2

           "ignore" is deprecated. Catch exceptions explicitly instead.

    downcast : str, default None
        Can be 'integer', 'signed', 'unsigned', or 'float'.
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    Downcasting of nullable integer and floating dtypes is supported:

    >>> s = pd.Series([1, 2, 3], dtype="Int64")
    >>> pd.to_numeric(s, downcast="integer")
    0    1
    1    2
    2    3
    dtype: Int8
    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
    >>> pd.to_numeric(s, downcast="float")
    0    1.0
    1    2.1
    2    3.0
    dtype: Float32
    """
    # Validate keyword options up front so bad parameters fail fast,
    # before any conversion work is attempted.
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")
    if errors == "ignore":
        # GH#54467
        warnings.warn(
            "errors='ignore' is deprecated and will raise in a future version. "
            "Use to_numeric without passing `errors` and catch exceptions "
            "explicitly instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    check_dtype_backend(dtype_backend)

    # Normalize the input into a 1-D array-like ``values``; these flags
    # remember the original container so it can be rebuilt on return.
    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndex):
        is_index = True
        if needs_i8_conversion(arg.dtype):
            # datetime64/timedelta64-like Index: reinterpret as int64
            values = arg.view("i8")
        else:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        if is_decimal(arg):
            # Decimal scalars are converted directly to float
            return float(arg)
        if is_number(arg):
            # already numeric: nothing to do
            return arg
        # non-numeric scalar (e.g. a string): wrap in a 1-element object
        # array so the array machinery below can parse it
        is_scalars = True
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    # Fallback for errors != "raise": if conversion fails below, the
    # untouched input values are handed back instead.
    orig_values = values

    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
    # save mask to reconstruct the full array after casting
    mask: npt.NDArray[np.bool_] | None = None
    if isinstance(values, BaseMaskedArray):
        mask = values._mask
        values = values._data[~mask]

    values_dtype = getattr(values, "dtype", None)
    if isinstance(values_dtype, ArrowDtype):
        # ArrowDtype input: convert only the non-null values to numpy;
        # isna() records where the gaps were
        mask = values.isna()
        values = values.dropna().to_numpy()
    new_mask: np.ndarray | None = None
    if is_numeric_dtype(values_dtype):
        # already numeric: skip parsing entirely
        pass
    elif lib.is_np_dtype(values_dtype, "mM"):
        # datetime64/timedelta64 ndarray: reinterpret as int64
        values = values.view(np.int64)
    else:
        values = ensure_object(values)
        # only errors="coerce" maps unparseable entries to NaN;
        # "raise"/"ignore" let the exception propagate to the handler below
        coerce_numeric = errors not in ("ignore", "raise")
        try:
            # NOTE: precedence is ``A or (B and C)`` — a masked-nullable
            # result is requested either when a dtype_backend was passed,
            # or for non-pyarrow_numpy StringDtype input
            values, new_mask = lib.maybe_convert_numeric(  # type: ignore[call-overload]
                values,
                set(),
                coerce_numeric=coerce_numeric,
                convert_to_masked_nullable=dtype_backend is not lib.no_default
                or isinstance(values_dtype, StringDtype)
                and not values_dtype.storage == "pyarrow_numpy",
            )
        except (ValueError, TypeError):
            if errors == "raise":
                raise
            # errors="ignore" (or "coerce" hitting a non-parse error):
            # fall back to the original, unconverted values
            values = orig_values

    if new_mask is not None:
        # Remove unnecessary values, is expected later anyway and enables
        # downcasting
        values = values[~new_mask]
    elif (
        # precedence again: ``(A and B) or (C and D)`` — synthesize an
        # all-False mask (no missing values) when a nullable result is
        # wanted but the parser did not produce a mask
        dtype_backend is not lib.no_default
        and new_mask is None
        or isinstance(values_dtype, StringDtype)
        and not values_dtype.storage == "pyarrow_numpy"
    ):
        new_mask = np.zeros(values.shape, dtype=np.bool_)

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values.dtype):
        typecodes: str | None = None

        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
            # unsigned downcast is only valid when all values are non-negative
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            typecodes = np.typecodes["Float"]

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for typecode in typecodes:
                dtype = np.dtype(typecode)
                if dtype.itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_numeric(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct
    # masked array
    if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype):
        if mask is None or (new_mask is not None and new_mask.shape == mask.shape):
            # GH 52588: prefer the parser's mask when the input either had
            # no mask or the shapes line up (new_mask may itself be None here)
            mask = new_mask
        else:
            mask = mask.copy()
        assert isinstance(mask, np.ndarray)
        # scatter the converted values back into a full-length buffer,
        # leaving zeros at masked (missing) positions
        data = np.zeros(mask.shape, dtype=values.dtype)
        data[~mask] = values

        from pandas.core.arrays import (
            ArrowExtensionArray,
            BooleanArray,
            FloatingArray,
            IntegerArray,
        )

        # pick the masked-array class matching the result dtype
        klass: type[IntegerArray | BooleanArray | FloatingArray]
        if is_integer_dtype(data.dtype):
            klass = IntegerArray
        elif is_bool_dtype(data.dtype):
            klass = BooleanArray
        else:
            klass = FloatingArray
        values = klass(data, mask)

        if dtype_backend == "pyarrow" or isinstance(values_dtype, ArrowDtype):
            # round-trip through __arrow_array__ to produce an Arrow-backed result
            values = ArrowExtensionArray(values.__arrow_array__())

    # rebuild the same container type the caller passed in
    if is_series:
        return arg._constructor(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        from pandas import Index

        return Index(values, name=arg.name)
    elif is_scalars:
        # unwrap the 1-element array created for scalar input
        return values[0]
    else:
        return values