Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/tools/numeric.py: 13%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

114 statements  

1from __future__ import annotations 

2 

3from typing import ( 

4 TYPE_CHECKING, 

5 Literal, 

6) 

7import warnings 

8 

9import numpy as np 

10 

11from pandas._libs import lib 

12from pandas.util._exceptions import find_stack_level 

13from pandas.util._validators import check_dtype_backend 

14 

15from pandas.core.dtypes.cast import maybe_downcast_numeric 

16from pandas.core.dtypes.common import ( 

17 ensure_object, 

18 is_bool_dtype, 

19 is_decimal, 

20 is_integer_dtype, 

21 is_number, 

22 is_numeric_dtype, 

23 is_scalar, 

24 is_string_dtype, 

25 needs_i8_conversion, 

26) 

27from pandas.core.dtypes.dtypes import ArrowDtype 

28from pandas.core.dtypes.generic import ( 

29 ABCIndex, 

30 ABCSeries, 

31) 

32 

33from pandas.core.arrays import BaseMaskedArray 

34from pandas.core.arrays.string_ import StringDtype 

35 

36if TYPE_CHECKING: 

37 from pandas._typing import ( 

38 DateTimeErrorChoices, 

39 DtypeBackend, 

40 npt, 

41 ) 

42 

43 

44def to_numeric( 

45 arg, 

46 errors: DateTimeErrorChoices = "raise", 

47 downcast: Literal["integer", "signed", "unsigned", "float"] | None = None, 

48 dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, 

49): 

50 """ 

51 Convert argument to a numeric type. 

52 

53 The default return dtype is `float64` or `int64` 

54 depending on the data supplied. Use the `downcast` parameter 

55 to obtain other dtypes. 

56 

57 Please note that precision loss may occur if really large numbers 

58 are passed in. Due to the internal limitations of `ndarray`, if 

59 numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min) 

60 or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are 

61 passed in, it is very likely they will be converted to float so that 

62 they can be stored in an `ndarray`. These warnings apply similarly to 

63 `Series` since it internally leverages `ndarray`. 

64 

65 Parameters 

66 ---------- 

67 arg : scalar, list, tuple, 1-d array, or Series 

68 Argument to be converted. 

69 errors : {'ignore', 'raise', 'coerce'}, default 'raise' 

70 - If 'raise', then invalid parsing will raise an exception. 

71 - If 'coerce', then invalid parsing will be set as NaN. 

72 - If 'ignore', then invalid parsing will return the input. 

73 

74 .. versionchanged:: 2.2 

75 

76 "ignore" is deprecated. Catch exceptions explicitly instead. 

77 

78 downcast : str, default None 

79 Can be 'integer', 'signed', 'unsigned', or 'float'. 

80 If not None, and if the data has been successfully cast to a 

81 numerical dtype (or if the data was numeric to begin with), 

82 downcast that resulting data to the smallest numerical dtype 

83 possible according to the following rules: 

84 

85 - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) 

86 - 'unsigned': smallest unsigned int dtype (min.: np.uint8) 

87 - 'float': smallest float dtype (min.: np.float32) 

88 

89 As this behaviour is separate from the core conversion to 

90 numeric values, any errors raised during the downcasting 

91 will be surfaced regardless of the value of the 'errors' input. 

92 

93 In addition, downcasting will only occur if the size 

94 of the resulting data's dtype is strictly larger than 

95 the dtype it is to be cast to, so if none of the dtypes 

96 checked satisfy that specification, no downcasting will be 

97 performed on the data. 

98 dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' 

99 Back-end data type applied to the resultant :class:`DataFrame` 

100 (still experimental). Behaviour is as follows: 

101 

102 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` 

103 (default). 

104 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` 

105 DataFrame. 

106 

107 .. versionadded:: 2.0 

108 

109 Returns 

110 ------- 

111 ret 

112 Numeric if parsing succeeded. 

113 Return type depends on input. Series if Series, otherwise ndarray. 

114 

115 See Also 

116 -------- 

117 DataFrame.astype : Cast argument to a specified dtype. 

118 to_datetime : Convert argument to datetime. 

119 to_timedelta : Convert argument to timedelta. 

120 numpy.ndarray.astype : Cast a numpy array to a specified type. 

121 DataFrame.convert_dtypes : Convert dtypes. 

122 

123 Examples 

124 -------- 

125 Take separate series and convert to numeric, coercing when told to 

126 

127 >>> s = pd.Series(['1.0', '2', -3]) 

128 >>> pd.to_numeric(s) 

129 0 1.0 

130 1 2.0 

131 2 -3.0 

132 dtype: float64 

133 >>> pd.to_numeric(s, downcast='float') 

134 0 1.0 

135 1 2.0 

136 2 -3.0 

137 dtype: float32 

138 >>> pd.to_numeric(s, downcast='signed') 

139 0 1 

140 1 2 

141 2 -3 

142 dtype: int8 

143 >>> s = pd.Series(['apple', '1.0', '2', -3]) 

144 >>> pd.to_numeric(s, errors='coerce') 

145 0 NaN 

146 1 1.0 

147 2 2.0 

148 3 -3.0 

149 dtype: float64 

150 

151 Downcasting of nullable integer and floating dtypes is supported: 

152 

153 >>> s = pd.Series([1, 2, 3], dtype="Int64") 

154 >>> pd.to_numeric(s, downcast="integer") 

155 0 1 

156 1 2 

157 2 3 

158 dtype: Int8 

159 >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64") 

160 >>> pd.to_numeric(s, downcast="float") 

161 0 1.0 

162 1 2.1 

163 2 3.0 

164 dtype: Float32 

165 """ 

166 if downcast not in (None, "integer", "signed", "unsigned", "float"): 

167 raise ValueError("invalid downcasting method provided") 

168 

169 if errors not in ("ignore", "raise", "coerce"): 

170 raise ValueError("invalid error value specified") 

171 if errors == "ignore": 

172 # GH#54467 

173 warnings.warn( 

174 "errors='ignore' is deprecated and will raise in a future version. " 

175 "Use to_numeric without passing `errors` and catch exceptions " 

176 "explicitly instead", 

177 FutureWarning, 

178 stacklevel=find_stack_level(), 

179 ) 

180 

181 check_dtype_backend(dtype_backend) 

182 

183 is_series = False 

184 is_index = False 

185 is_scalars = False 

186 

187 if isinstance(arg, ABCSeries): 

188 is_series = True 

189 values = arg.values 

190 elif isinstance(arg, ABCIndex): 

191 is_index = True 

192 if needs_i8_conversion(arg.dtype): 

193 values = arg.view("i8") 

194 else: 

195 values = arg.values 

196 elif isinstance(arg, (list, tuple)): 

197 values = np.array(arg, dtype="O") 

198 elif is_scalar(arg): 

199 if is_decimal(arg): 

200 return float(arg) 

201 if is_number(arg): 

202 return arg 

203 is_scalars = True 

204 values = np.array([arg], dtype="O") 

205 elif getattr(arg, "ndim", 1) > 1: 

206 raise TypeError("arg must be a list, tuple, 1-d array, or Series") 

207 else: 

208 values = arg 

209 

210 orig_values = values 

211 

212 # GH33013: for IntegerArray & FloatingArray extract non-null values for casting 

213 # save mask to reconstruct the full array after casting 

214 mask: npt.NDArray[np.bool_] | None = None 

215 if isinstance(values, BaseMaskedArray): 

216 mask = values._mask 

217 values = values._data[~mask] 

218 

219 values_dtype = getattr(values, "dtype", None) 

220 if isinstance(values_dtype, ArrowDtype): 

221 mask = values.isna() 

222 values = values.dropna().to_numpy() 

223 new_mask: np.ndarray | None = None 

224 if is_numeric_dtype(values_dtype): 

225 pass 

226 elif lib.is_np_dtype(values_dtype, "mM"): 

227 values = values.view(np.int64) 

228 else: 

229 values = ensure_object(values) 

230 coerce_numeric = errors not in ("ignore", "raise") 

231 try: 

232 values, new_mask = lib.maybe_convert_numeric( # type: ignore[call-overload] 

233 values, 

234 set(), 

235 coerce_numeric=coerce_numeric, 

236 convert_to_masked_nullable=dtype_backend is not lib.no_default 

237 or isinstance(values_dtype, StringDtype) 

238 and not values_dtype.storage == "pyarrow_numpy", 

239 ) 

240 except (ValueError, TypeError): 

241 if errors == "raise": 

242 raise 

243 values = orig_values 

244 

245 if new_mask is not None: 

246 # Remove unnecessary values, is expected later anyway and enables 

247 # downcasting 

248 values = values[~new_mask] 

249 elif ( 

250 dtype_backend is not lib.no_default 

251 and new_mask is None 

252 or isinstance(values_dtype, StringDtype) 

253 and not values_dtype.storage == "pyarrow_numpy" 

254 ): 

255 new_mask = np.zeros(values.shape, dtype=np.bool_) 

256 

257 # attempt downcast only if the data has been successfully converted 

258 # to a numerical dtype and if a downcast method has been specified 

259 if downcast is not None and is_numeric_dtype(values.dtype): 

260 typecodes: str | None = None 

261 

262 if downcast in ("integer", "signed"): 

263 typecodes = np.typecodes["Integer"] 

264 elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0): 

265 typecodes = np.typecodes["UnsignedInteger"] 

266 elif downcast == "float": 

267 typecodes = np.typecodes["Float"] 

268 

269 # pandas support goes only to np.float32, 

270 # as float dtypes smaller than that are 

271 # extremely rare and not well supported 

272 float_32_char = np.dtype(np.float32).char 

273 float_32_ind = typecodes.index(float_32_char) 

274 typecodes = typecodes[float_32_ind:] 

275 

276 if typecodes is not None: 

277 # from smallest to largest 

278 for typecode in typecodes: 

279 dtype = np.dtype(typecode) 

280 if dtype.itemsize <= values.dtype.itemsize: 

281 values = maybe_downcast_numeric(values, dtype) 

282 

283 # successful conversion 

284 if values.dtype == dtype: 

285 break 

286 

287 # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct 

288 # masked array 

289 if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): 

290 if mask is None or (new_mask is not None and new_mask.shape == mask.shape): 

291 # GH 52588 

292 mask = new_mask 

293 else: 

294 mask = mask.copy() 

295 assert isinstance(mask, np.ndarray) 

296 data = np.zeros(mask.shape, dtype=values.dtype) 

297 data[~mask] = values 

298 

299 from pandas.core.arrays import ( 

300 ArrowExtensionArray, 

301 BooleanArray, 

302 FloatingArray, 

303 IntegerArray, 

304 ) 

305 

306 klass: type[IntegerArray | BooleanArray | FloatingArray] 

307 if is_integer_dtype(data.dtype): 

308 klass = IntegerArray 

309 elif is_bool_dtype(data.dtype): 

310 klass = BooleanArray 

311 else: 

312 klass = FloatingArray 

313 values = klass(data, mask) 

314 

315 if dtype_backend == "pyarrow" or isinstance(values_dtype, ArrowDtype): 

316 values = ArrowExtensionArray(values.__arrow_array__()) 

317 

318 if is_series: 

319 return arg._constructor(values, index=arg.index, name=arg.name) 

320 elif is_index: 

321 # because we want to coerce to numeric if possible, 

322 # do not use _shallow_copy 

323 from pandas import Index 

324 

325 return Index(values, name=arg.name) 

326 elif is_scalars: 

327 return values[0] 

328 else: 

329 return values