1"""
2Routines for casting.
3"""
4
5from __future__ import annotations
6
7import datetime as dt
8import functools
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Literal,
13 Sized,
14 TypeVar,
15 cast,
16 overload,
17)
18import warnings
19
20import numpy as np
21
22from pandas._libs import lib
23from pandas._libs.missing import (
24 NA,
25 NAType,
26 checknull,
27)
28from pandas._libs.tslibs import (
29 NaT,
30 OutOfBoundsDatetime,
31 OutOfBoundsTimedelta,
32 Timedelta,
33 Timestamp,
34 get_unit_from_dtype,
35 is_supported_unit,
36)
37from pandas._libs.tslibs.timedeltas import array_to_timedelta64
38from pandas._typing import (
39 ArrayLike,
40 Dtype,
41 DtypeObj,
42 NumpyIndexT,
43 Scalar,
44 npt,
45)
46from pandas.errors import (
47 IntCastingNaNError,
48 LossySetitemError,
49)
50
51from pandas.core.dtypes.common import (
52 ensure_int8,
53 ensure_int16,
54 ensure_int32,
55 ensure_int64,
56 ensure_object,
57 ensure_str,
58 is_bool,
59 is_bool_dtype,
60 is_complex,
61 is_complex_dtype,
62 is_datetime64_dtype,
63 is_extension_array_dtype,
64 is_float,
65 is_float_dtype,
66 is_integer,
67 is_integer_dtype,
68 is_numeric_dtype,
69 is_object_dtype,
70 is_scalar,
71 is_signed_integer_dtype,
72 is_string_dtype,
73 is_timedelta64_dtype,
74 is_unsigned_integer_dtype,
75 pandas_dtype as pandas_dtype_func,
76)
77from pandas.core.dtypes.dtypes import (
78 BaseMaskedDtype,
79 CategoricalDtype,
80 DatetimeTZDtype,
81 ExtensionDtype,
82 IntervalDtype,
83 PandasExtensionDtype,
84 PeriodDtype,
85)
86from pandas.core.dtypes.generic import (
87 ABCExtensionArray,
88 ABCIndex,
89 ABCSeries,
90)
91from pandas.core.dtypes.inference import is_list_like
92from pandas.core.dtypes.missing import (
93 is_valid_na_for_dtype,
94 isna,
95 na_value_for_dtype,
96 notna,
97)
98
99from pandas.io._util import _arrow_dtype_mapping
100
101if TYPE_CHECKING:
102 from pandas import Index
103 from pandas.core.arrays import (
104 Categorical,
105 DatetimeArray,
106 ExtensionArray,
107 IntervalArray,
108 PeriodArray,
109 TimedeltaArray,
110 )
111
112
113_int8_max = np.iinfo(np.int8).max
114_int16_max = np.iinfo(np.int16).max
115_int32_max = np.iinfo(np.int32).max
116_int64_max = np.iinfo(np.int64).max
117
118_dtype_obj = np.dtype(object)
119
120NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
121
122
123def maybe_convert_platform(
124 values: list | tuple | range | np.ndarray | ExtensionArray,
125) -> ArrayLike:
126 """try to do platform conversion, allow ndarray or list here"""
127 arr: ArrayLike
128
129 if isinstance(values, (list, tuple, range)):
130 arr = construct_1d_object_array_from_listlike(values)
131 else:
132 # The caller is responsible for ensuring that we have np.ndarray
133 # or ExtensionArray here.
134 arr = values
135
136 if arr.dtype == _dtype_obj:
137 arr = cast(np.ndarray, arr)
138 arr = lib.maybe_convert_objects(arr)
139
140 return arr
141
142
143def is_nested_object(obj) -> bool:
144 """
145 return a boolean if we have a nested object, e.g. a Series with 1 or
146 more Series elements
147
148 This may not be necessarily be performant.
149
150 """
151 return bool(
152 isinstance(obj, ABCSeries)
153 and is_object_dtype(obj.dtype)
154 and any(isinstance(v, ABCSeries) for v in obj._values)
155 )
156
157
158def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
159 """
160 Cast scalar to Timestamp or Timedelta if scalar is datetime-like
161 and dtype is not object.
162
163 Parameters
164 ----------
165 value : scalar
166 dtype : Dtype, optional
167
168 Returns
169 -------
170 scalar
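
    Examples
    --------
    Illustrative (datetime-likes are boxed to pandas scalars):

    >>> maybe_box_datetimelike(np.datetime64("2020-01-01"))
    Timestamp('2020-01-01 00:00:00')
    >>> maybe_box_datetimelike(np.timedelta64(1, "D"))
    Timedelta('1 days 00:00:00')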
171 """
172 if dtype == _dtype_obj:
173 pass
174 elif isinstance(value, (np.datetime64, dt.datetime)):
175 value = Timestamp(value)
176 elif isinstance(value, (np.timedelta64, dt.timedelta)):
177 value = Timedelta(value)
178
179 return value
180
181
182def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType:
183 """
184 If passed a scalar cast the scalar to a python native type.
185
186 Parameters
187 ----------
188 value : scalar or Series
189
190 Returns
191 -------
192 scalar or Series
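
    Examples
    --------
    Illustrative (numpy scalars become Python natives):

    >>> maybe_box_native(np.int64(1))
    1
    >>> maybe_box_native(np.float64(1.5))
    1.5
    >>> maybe_box_native(np.bool_(True))
    True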
193 """
194 if is_float(value):
195 # error: Argument 1 to "float" has incompatible type
196 # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]";
197 # expected "Union[SupportsFloat, _SupportsIndex, str]"
198 value = float(value) # type: ignore[arg-type]
199 elif is_integer(value):
200 # error: Argument 1 to "int" has incompatible type
201 # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]";
202 # expected "Union[str, SupportsInt, _SupportsIndex, _SupportsTrunc]"
203 value = int(value) # type: ignore[arg-type]
204 elif is_bool(value):
205 value = bool(value)
206 elif isinstance(value, (np.datetime64, np.timedelta64)):
207 value = maybe_box_datetimelike(value)
208 elif value is NA:
209 value = None
210 return value
211
212
213def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar:
214 """
215 Convert a Timedelta or Timestamp to timedelta64 or datetime64 for setting
216 into a numpy array. Failing to unbox would risk dropping nanoseconds.
217
218 Notes
219 -----
220 Caller is responsible for checking dtype.kind in ["m", "M"]
221 """
222 if is_valid_na_for_dtype(value, dtype):
223 # GH#36541: can't fill array directly with pd.NaT
224 # > np.empty(10, dtype="datetime64[ns]").fill(pd.NaT)
225 # ValueError: cannot convert float NaN to integer
226 value = dtype.type("NaT", "ns")
227 elif isinstance(value, Timestamp):
228 if value.tz is None:
229 value = value.to_datetime64()
230 elif not isinstance(dtype, DatetimeTZDtype):
231 raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype")
232 elif isinstance(value, Timedelta):
233 value = value.to_timedelta64()
234
235 _disallow_mismatched_datetimelike(value, dtype)
236 return value
237
238
239def _disallow_mismatched_datetimelike(value, dtype: DtypeObj):
240 """
241 numpy allows np.array(dt64values, dtype="timedelta64[ns]") and
242 vice-versa, but we do not want to allow this, so we need to
243 check explicitly
244 """
245 vdtype = getattr(value, "dtype", None)
246 if vdtype is None:
247 return
248 elif (vdtype.kind == "m" and dtype.kind == "M") or (
249 vdtype.kind == "M" and dtype.kind == "m"
250 ):
251 raise TypeError(f"Cannot cast {repr(value)} to {dtype}")
252
253
254@overload
255def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray:
256 ...
257
258
259@overload
260def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike:
261 ...
262
263
264def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike:
265 """
266 try to cast to the specified dtype (e.g. convert back to bool/int
267 or could be an astype of float64->float32
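
    A rough illustration (an all-integer float array downcasts via "infer"):

    >>> maybe_downcast_to_dtype(np.array([1.0, 2.0]), "infer")
    array([1, 2])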
268 """
269 do_round = False
270
271 if isinstance(dtype, str):
272 if dtype == "infer":
273 inferred_type = lib.infer_dtype(result, skipna=False)
274 if inferred_type == "boolean":
275 dtype = "bool"
276 elif inferred_type == "integer":
277 dtype = "int64"
278 elif inferred_type == "datetime64":
279 dtype = "datetime64[ns]"
280 elif inferred_type in ["timedelta", "timedelta64"]:
281 dtype = "timedelta64[ns]"
282
283 # try to upcast here
284 elif inferred_type == "floating":
285 dtype = "int64"
286 if issubclass(result.dtype.type, np.number):
287 do_round = True
288
289 else:
290 # TODO: complex? what if result is already non-object?
291 dtype = "object"
292
293 dtype = np.dtype(dtype)
294
295 if not isinstance(dtype, np.dtype):
296 # enforce our signature annotation
297 raise TypeError(dtype) # pragma: no cover
298
299 converted = maybe_downcast_numeric(result, dtype, do_round)
300 if converted is not result:
301 return converted
302
303 # a datetimelike
304 # GH12821, iNaT is cast to float
305 if dtype.kind in ["M", "m"] and result.dtype.kind in ["i", "f"]:
306 result = result.astype(dtype)
307
308 elif dtype.kind == "m" and result.dtype == _dtype_obj:
309 # test_where_downcast_to_td64
310 result = cast(np.ndarray, result)
311 result = array_to_timedelta64(result)
312
313 elif dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj:
314 result = cast(np.ndarray, result)
315 return np.asarray(maybe_cast_to_datetime(result, dtype=dtype))
316
317 return result
318
319
320@overload
321def maybe_downcast_numeric(
322 result: np.ndarray, dtype: np.dtype, do_round: bool = False
323) -> np.ndarray:
324 ...
325
326
327@overload
328def maybe_downcast_numeric(
329 result: ExtensionArray, dtype: DtypeObj, do_round: bool = False
330) -> ArrayLike:
331 ...
332
333
334def maybe_downcast_numeric(
335 result: ArrayLike, dtype: DtypeObj, do_round: bool = False
336) -> ArrayLike:
337 """
338 Subset of maybe_downcast_to_dtype restricted to numeric dtypes.
339
340 Parameters
341 ----------
342 result : ndarray or ExtensionArray
343 dtype : np.dtype or ExtensionDtype
344 do_round : bool
345
346 Returns
347 -------
348 ndarray or ExtensionArray
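
    Examples
    --------
    Illustrative (only lossless downcasts are taken):

    >>> maybe_downcast_numeric(np.array([1.0, 2.0]), np.dtype(np.int64))
    array([1, 2])
    >>> maybe_downcast_numeric(np.array([1.5, 2.0]), np.dtype(np.int64))
    array([1.5, 2. ])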
349 """
350 if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype):
351 # e.g. SparseDtype has no itemsize attr
352 return result
353
354 def trans(x):
355 if do_round:
356 return x.round()
357 return x
358
359 if dtype.kind == result.dtype.kind:
360 # don't allow upcasts here (except if empty)
361 if result.dtype.itemsize <= dtype.itemsize and result.size:
362 return result
363
364 if is_bool_dtype(dtype) or is_integer_dtype(dtype):
365 if not result.size:
366 # if we don't have any elements, just astype it
367 return trans(result).astype(dtype)
368
369 # do a test on the first element, if it fails then we are done
370 r = result.ravel()
371 arr = np.array([r[0]])
372
373 if isna(arr).any():
374 # if we have any nulls, then we are done
375 return result
376
377 elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)):
378 # a comparable, e.g. a Decimal may slip in here
379 return result
380
381 if (
382 issubclass(result.dtype.type, (np.object_, np.number))
383 and notna(result).all()
384 ):
385 new_result = trans(result).astype(dtype)
386 if new_result.dtype.kind == "O" or result.dtype.kind == "O":
387 # np.allclose may raise TypeError on object-dtype
388 if (new_result == result).all():
389 return new_result
390 else:
391 if np.allclose(new_result, result, rtol=0):
392 return new_result
393
394 elif (
395 issubclass(dtype.type, np.floating)
396 and not is_bool_dtype(result.dtype)
397 and not is_string_dtype(result.dtype)
398 ):
399 with warnings.catch_warnings():
400 warnings.filterwarnings(
401 "ignore", "overflow encountered in cast", RuntimeWarning
402 )
403 new_result = result.astype(dtype)
404
405 # Adjust tolerances based on floating point size
406 size_tols = {4: 5e-4, 8: 5e-8, 16: 5e-16}
407
408 atol = size_tols.get(new_result.dtype.itemsize, 0.0)
409
410 # Check downcast float values are still equal within 7 digits when
411 # converting from float64 to float32
412 if np.allclose(new_result, result, equal_nan=True, rtol=0.0, atol=atol):
413 return new_result
414
415 elif dtype.kind == result.dtype.kind == "c":
416 new_result = result.astype(dtype)
417
418 if np.array_equal(new_result, result, equal_nan=True):
419 # TODO: use tolerance like we do for float?
420 return new_result
421
422 return result
423
424
425def maybe_upcast_numeric_to_64bit(arr: NumpyIndexT) -> NumpyIndexT:
426 """
427 If array is a int/uint/float bit size lower than 64 bit, upcast it to 64 bit.
428
429 Parameters
430 ----------
431 arr : ndarray or ExtensionArray
432
433 Returns
434 -------
435 ndarray or ExtensionArray
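
    Examples
    --------
    Illustrative (smaller numeric dtypes are widened to 64 bits):

    >>> maybe_upcast_numeric_to_64bit(np.array([1, 2], dtype=np.int32)).dtype
    dtype('int64')
    >>> maybe_upcast_numeric_to_64bit(np.array([1.0], dtype=np.float32)).dtype
    dtype('float64')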
436 """
437 dtype = arr.dtype
438 if is_signed_integer_dtype(dtype) and dtype != np.int64:
439 return arr.astype(np.int64)
440 elif is_unsigned_integer_dtype(dtype) and dtype != np.uint64:
441 return arr.astype(np.uint64)
442 elif is_float_dtype(dtype) and dtype != np.float64:
443 return arr.astype(np.float64)
444 else:
445 return arr
446
447
448def maybe_cast_pointwise_result(
449 result: ArrayLike,
450 dtype: DtypeObj,
451 numeric_only: bool = False,
452 same_dtype: bool = True,
453) -> ArrayLike:
454 """
455 Try casting result of a pointwise operation back to the original dtype if
456 appropriate.
457
458 Parameters
459 ----------
460 result : array-like
461 Result to cast.
462 dtype : np.dtype or ExtensionDtype
463 Input Series from which result was calculated.
464 numeric_only : bool, default False
465 Whether to cast only numerics or datetimes as well.
466 same_dtype : bool, default True
467 Specify dtype when calling _from_sequence
468
469 Returns
470 -------
471 result : array-like
472 result maybe casted to the dtype.
473 """
474
475 assert not is_scalar(result)
476
477 if isinstance(dtype, ExtensionDtype):
478 if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
479 # TODO: avoid this special-casing
480 # We have to special case categorical so as not to upcast
481 # things like counts back to categorical
482
483 cls = dtype.construct_array_type()
484 if same_dtype:
485 result = maybe_cast_to_extension_array(cls, result, dtype=dtype)
486 else:
487 result = maybe_cast_to_extension_array(cls, result)
488
489 elif (numeric_only and is_numeric_dtype(dtype)) or not numeric_only:
490 result = maybe_downcast_to_dtype(result, dtype)
491
492 return result
493
494
495def maybe_cast_to_extension_array(
496 cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
497) -> ArrayLike:
498 """
499 Call to `_from_sequence` that returns the object unchanged on Exception.
500
501 Parameters
502 ----------
503 cls : class, subclass of ExtensionArray
504 obj : arraylike
505 Values to pass to cls._from_sequence
506 dtype : ExtensionDtype, optional
507
508 Returns
509 -------
510 ExtensionArray or obj
511 """
512 from pandas.core.arrays.string_ import BaseStringArray
513
514 assert isinstance(cls, type), f"must pass a type: {cls}"
515 assertion_msg = f"must pass a subclass of ExtensionArray: {cls}"
516 assert issubclass(cls, ABCExtensionArray), assertion_msg
517
518 # Everything can be converted to StringArrays, but we may not want to convert
519 if issubclass(cls, BaseStringArray) and lib.infer_dtype(obj) != "string":
520 return obj
521
522 try:
523 result = cls._from_sequence(obj, dtype=dtype)
524 except Exception:
525 # We can't predict what downstream EA constructors may raise
526 result = obj
527 return result
528
529
530@overload
531def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype:
532 ...
533
534
535@overload
536def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype:
537 ...
538
539
540def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj:
541 """
542 If we have a dtype that cannot hold NA values, find the best match that can.
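
    Examples
    --------
    Illustrative promotions:

    >>> ensure_dtype_can_hold_na(np.dtype(np.int64))
    dtype('float64')
    >>> ensure_dtype_can_hold_na(np.dtype(np.bool_))
    dtype('O')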
543 """
544 if isinstance(dtype, ExtensionDtype):
545 if dtype._can_hold_na:
546 return dtype
547 elif isinstance(dtype, IntervalDtype):
548 # TODO(GH#45349): don't special-case IntervalDtype, allow
549 # overriding instead of returning object below.
550 return IntervalDtype(np.float64, closed=dtype.closed)
551 return _dtype_obj
552 elif dtype.kind == "b":
553 return _dtype_obj
554 elif dtype.kind in ["i", "u"]:
555 return np.dtype(np.float64)
556 return dtype
557
558
559_canonical_nans = {
560 np.datetime64: np.datetime64("NaT", "ns"),
561 np.timedelta64: np.timedelta64("NaT", "ns"),
562 type(np.nan): np.nan,
563}
564
565
566def maybe_promote(dtype: np.dtype, fill_value=np.nan):
567 """
568 Find the minimal dtype that can hold both the given dtype and fill_value.
569
570 Parameters
571 ----------
572 dtype : np.dtype
573 fill_value : scalar, default np.nan
574
575 Returns
576 -------
577 dtype
578 Upcasted from dtype argument if necessary.
579 fill_value
580 Upcasted from fill_value argument if necessary.
581
582 Raises
583 ------
584 ValueError
585 If fill_value is a non-scalar and dtype is not object.
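
    Examples
    --------
    Illustrative (integer dtypes cannot hold NaN, so both are promoted):

    >>> maybe_promote(np.dtype(np.int64), np.nan)
    (dtype('float64'), nan)
    >>> maybe_promote(np.dtype(object), np.nan)
    (dtype('O'), nan)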
586 """
587 orig = fill_value
588 orig_is_nat = False
589 if checknull(fill_value):
590 # https://github.com/pandas-dev/pandas/pull/39692#issuecomment-1441051740
591 # avoid cache misses with NaN/NaT values that are not singletons
592 if fill_value is not NA:
593 try:
594 orig_is_nat = np.isnat(fill_value)
595 except TypeError:
596 pass
597
598 fill_value = _canonical_nans.get(type(fill_value), fill_value)
599
600 # for performance, we are using a cached version of the actual implementation
601 # of the function in _maybe_promote. However, this doesn't always work (in case
602 # of non-hashable arguments), so we fallback to the actual implementation if needed
603 try:
604 # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
605 # "Type[Any]"; expected "Hashable" [arg-type]
606 dtype, fill_value = _maybe_promote_cached(
607 dtype, fill_value, type(fill_value) # type: ignore[arg-type]
608 )
609 except TypeError:
610 # if fill_value is not hashable (required for caching)
611 dtype, fill_value = _maybe_promote(dtype, fill_value)
612
613 if (dtype == _dtype_obj and orig is not None) or (
614 orig_is_nat and np.datetime_data(orig)[0] != "ns"
615 ):
616 # GH#51592,53497 restore our potentially non-canonical fill_value
617 fill_value = orig
618 return dtype, fill_value
619
620
621@functools.lru_cache(maxsize=128)
622def _maybe_promote_cached(dtype, fill_value, fill_value_type):
623 # The cached version of _maybe_promote below
624 # This also use fill_value_type as (unused) argument to use this in the
625 # cache lookup -> to differentiate 1 and True
626 return _maybe_promote(dtype, fill_value)
627
628
629def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
630 # The actual implementation of the function, use `maybe_promote` above for
631 # a cached version.
632 if not is_scalar(fill_value):
633 # with object dtype there is nothing to promote, and the user can
634 # pass pretty much any weird fill_value they like
635 if not is_object_dtype(dtype):
636 # with object dtype there is nothing to promote, and the user can
637 # pass pretty much any weird fill_value they like
638 raise ValueError("fill_value must be a scalar")
639 dtype = _dtype_obj
640 return dtype, fill_value
641
642 kinds = ["i", "u", "f", "c", "m", "M"]
643 if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in kinds:
644 dtype = ensure_dtype_can_hold_na(dtype)
645 fv = na_value_for_dtype(dtype)
646 return dtype, fv
647
648 elif isinstance(dtype, CategoricalDtype):
649 if fill_value in dtype.categories or isna(fill_value):
650 return dtype, fill_value
651 else:
652 return object, ensure_object(fill_value)
653
654 elif isna(fill_value):
655 dtype = _dtype_obj
656 if fill_value is None:
657 # but we retain e.g. pd.NA
658 fill_value = np.nan
659 return dtype, fill_value
660
661 # returns tuple of (dtype, fill_value)
662 if issubclass(dtype.type, np.datetime64):
663 inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
664 if inferred == dtype:
665 return dtype, fv
666
667 from pandas.core.arrays import DatetimeArray
668
669 dta = DatetimeArray._from_sequence([], dtype="M8[ns]")
670 try:
671 fv = dta._validate_setitem_value(fill_value)
672 return dta.dtype, fv
673 except (ValueError, TypeError):
674 return _dtype_obj, fill_value
675
676 elif issubclass(dtype.type, np.timedelta64):
677 inferred, fv = infer_dtype_from_scalar(fill_value, pandas_dtype=True)
678 if inferred == dtype:
679 return dtype, fv
680
681 return np.dtype("object"), fill_value
682
683 elif is_float(fill_value):
684 if issubclass(dtype.type, np.bool_):
685 dtype = np.dtype(np.object_)
686
687 elif issubclass(dtype.type, np.integer):
688 dtype = np.dtype(np.float64)
689
690 elif dtype.kind == "f":
691 mst = np.min_scalar_type(fill_value)
692 if mst > dtype:
693 # e.g. mst is np.float64 and dtype is np.float32
694 dtype = mst
695
696 elif dtype.kind == "c":
697 mst = np.min_scalar_type(fill_value)
698 dtype = np.promote_types(dtype, mst)
699
700 elif is_bool(fill_value):
701 if not issubclass(dtype.type, np.bool_):
702 dtype = np.dtype(np.object_)
703
704 elif is_integer(fill_value):
705 if issubclass(dtype.type, np.bool_):
706 dtype = np.dtype(np.object_)
707
708 elif issubclass(dtype.type, np.integer):
709 if not np.can_cast(fill_value, dtype):
710 # upcast to prevent overflow
711 mst = np.min_scalar_type(fill_value)
712 dtype = np.promote_types(dtype, mst)
713 if dtype.kind == "f":
714 # Case where we disagree with numpy
715 dtype = np.dtype(np.object_)
716
717 elif is_complex(fill_value):
718 if issubclass(dtype.type, np.bool_):
719 dtype = np.dtype(np.object_)
720
721 elif issubclass(dtype.type, (np.integer, np.floating)):
722 mst = np.min_scalar_type(fill_value)
723 dtype = np.promote_types(dtype, mst)
724
725 elif dtype.kind == "c":
726 mst = np.min_scalar_type(fill_value)
727 if mst > dtype:
728 # e.g. mst is np.complex128 and dtype is np.complex64
729 dtype = mst
730
731 else:
732 dtype = np.dtype(np.object_)
733
734 # in case we have a string that looked like a number
735 if issubclass(dtype.type, (bytes, str)):
736 dtype = np.dtype(np.object_)
737
738 fill_value = _ensure_dtype_type(fill_value, dtype)
739 return dtype, fill_value
740
741
742def _ensure_dtype_type(value, dtype: np.dtype):
743 """
744 Ensure that the given value is an instance of the given dtype.
745
746 e.g. if out dtype is np.complex64_, we should have an instance of that
747 as opposed to a python complex object.
748
749 Parameters
750 ----------
751 value : object
752 dtype : np.dtype
753
754 Returns
755 -------
756 object
757 """
758 # Start with exceptions in which we do _not_ cast to numpy types
759
760 if dtype == _dtype_obj:
761 return value
762
763 # Note: before we get here we have already excluded isna(value)
764 return dtype.type(value)
765
766
767def infer_dtype_from(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
768 """
769 Interpret the dtype from a scalar or array.
770
771 Parameters
772 ----------
773 val : object
774 pandas_dtype : bool, default False
775 whether to infer dtype including pandas extension types.
776 If False, scalar/array belongs to pandas extension types is inferred as
777 object
778 """
779 if not is_list_like(val):
780 return infer_dtype_from_scalar(val, pandas_dtype=pandas_dtype)
781 return infer_dtype_from_array(val, pandas_dtype=pandas_dtype)
782
783
784def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, Any]:
785 """
786 Interpret the dtype from a scalar.
787
788 Parameters
789 ----------
790 pandas_dtype : bool, default False
791 whether to infer dtype including pandas extension types.
792 If False, scalar belongs to pandas extension types is inferred as
793 object
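
    Examples
    --------
    Illustrative inferences:

    >>> infer_dtype_from_scalar(1)
    (dtype('int64'), 1)
    >>> infer_dtype_from_scalar("foo")
    (dtype('O'), 'foo')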
794 """
795 dtype: DtypeObj = _dtype_obj
796
797 # a 1-element ndarray
798 if isinstance(val, np.ndarray):
799 if val.ndim != 0:
800 msg = "invalid ndarray passed to infer_dtype_from_scalar"
801 raise ValueError(msg)
802
803 dtype = val.dtype
804 val = lib.item_from_zerodim(val)
805
806 elif isinstance(val, str):
807 # If we create an empty array using a string to infer
808 # the dtype, NumPy will only allocate one character per entry
809 # so this is kind of bad. Alternately we could use np.repeat
810 # instead of np.empty (but then you still don't want things
811 # coming out as np.str_!
812
813 dtype = _dtype_obj
814
815 elif isinstance(val, (np.datetime64, dt.datetime)):
816 try:
817 val = Timestamp(val)
818 if val is not NaT:
819 val = val.as_unit("ns")
820 except OutOfBoundsDatetime:
821 return _dtype_obj, val
822
823 if val is NaT or val.tz is None:
824 val = val.to_datetime64()
825 dtype = val.dtype
826 # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
827 else:
828 if pandas_dtype:
829 dtype = DatetimeTZDtype(unit="ns", tz=val.tz)
830 else:
831 # return datetimetz as object
832 return _dtype_obj, val
833
834 elif isinstance(val, (np.timedelta64, dt.timedelta)):
835 try:
836 val = Timedelta(val)
837 except (OutOfBoundsTimedelta, OverflowError):
838 dtype = _dtype_obj
839 else:
840 dtype = np.dtype("m8[ns]")
841 val = np.timedelta64(val.value, "ns")
842
843 elif is_bool(val):
844 dtype = np.dtype(np.bool_)
845
846 elif is_integer(val):
847 if isinstance(val, np.integer):
848 dtype = np.dtype(type(val))
849 else:
850 dtype = np.dtype(np.int64)
851
852 try:
853 np.array(val, dtype=dtype)
854 except OverflowError:
855 dtype = np.array(val).dtype
856
857 elif is_float(val):
858 if isinstance(val, np.floating):
859 dtype = np.dtype(type(val))
860 else:
861 dtype = np.dtype(np.float64)
862
863 elif is_complex(val):
864 dtype = np.dtype(np.complex_)
865
866 elif pandas_dtype:
867 if lib.is_period(val):
868 dtype = PeriodDtype(freq=val.freq)
869 elif lib.is_interval(val):
870 subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0]
871 dtype = IntervalDtype(subtype=subtype, closed=val.closed)
872
873 return dtype, val
874
875
876def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
877 """
878 Convert datetimelike-keyed dicts to a Timestamp-keyed dict.
879
880 Parameters
881 ----------
882 d: dict-like object
883
884 Returns
885 -------
886 dict
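
    Examples
    --------
    Illustrative (datetime-like keys are boxed to Timestamp):

    >>> dict_compat({np.datetime64("2011-01-01"): 1})
    {Timestamp('2011-01-01 00:00:00'): 1}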
887 """
888 return {maybe_box_datetimelike(key): value for key, value in d.items()}
889
890
891def infer_dtype_from_array(
892 arr, pandas_dtype: bool = False
893) -> tuple[DtypeObj, ArrayLike]:
894 """
895 Infer the dtype from an array.
896
897 Parameters
898 ----------
899 arr : array
900 pandas_dtype : bool, default False
901 whether to infer dtype including pandas extension types.
902 If False, array belongs to pandas extension types
903 is inferred as object
904
905 Returns
906 -------
907 tuple (numpy-compat/pandas-compat dtype, array)
908
909 Notes
910 -----
911 if pandas_dtype=False. these infer to numpy dtypes
912 exactly with the exception that mixed / object dtypes
913 are not coerced by stringifying or conversion
914
915 if pandas_dtype=True. datetime64tz-aware/categorical
916 types will retain there character.
917
918 Examples
919 --------
920 >>> np.asarray([1, '1'])
921 array(['1', '1'], dtype='<U21')
922
923 >>> infer_dtype_from_array([1, '1'])
924 (dtype('O'), [1, '1'])
925 """
926 if isinstance(arr, np.ndarray):
927 return arr.dtype, arr
928
929 if not is_list_like(arr):
930 raise TypeError("'arr' must be list-like")
931
932 if pandas_dtype and is_extension_array_dtype(arr):
933 return arr.dtype, arr
934
935 elif isinstance(arr, ABCSeries):
936 return arr.dtype, np.asarray(arr)
937
938 # don't force numpy coerce with nan's
939 inferred = lib.infer_dtype(arr, skipna=False)
940 if inferred in ["string", "bytes", "mixed", "mixed-integer"]:
941 return (np.dtype(np.object_), arr)
942
943 arr = np.asarray(arr)
944 return arr.dtype, arr
945
946
947def _maybe_infer_dtype_type(element):
948 """
949 Try to infer an object's dtype, for use in arithmetic ops.
950
951 Uses `element.dtype` if that's available.
952 Objects implementing the iterator protocol are cast to a NumPy array,
953 and from there the array's type is used.
954
955 Parameters
956 ----------
957 element : object
958 Possibly has a `.dtype` attribute, and possibly the iterator
959 protocol.
960
961 Returns
962 -------
963 tipo : type
964
965 Examples
966 --------
967 >>> from collections import namedtuple
968 >>> Foo = namedtuple("Foo", "dtype")
969 >>> _maybe_infer_dtype_type(Foo(np.dtype("i8")))
970 dtype('int64')
971 """
972 tipo = None
973 if hasattr(element, "dtype"):
974 tipo = element.dtype
975 elif is_list_like(element):
976 element = np.asarray(element)
977 tipo = element.dtype
978 return tipo
979
980
981def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
982 """
983 Change string like dtypes to object for
984 ``DataFrame.select_dtypes()``.
985 """
986 # error: Argument 1 to <set> has incompatible type "Type[generic]"; expected
987 # "Union[dtype[Any], ExtensionDtype, None]"
988 # error: Argument 2 to <set> has incompatible type "Type[generic]"; expected
989 # "Union[dtype[Any], ExtensionDtype, None]"
990 non_string_dtypes = dtype_set - {
991 np.dtype("S").type, # type: ignore[arg-type]
992 np.dtype("<U").type, # type: ignore[arg-type]
993 }
994 if non_string_dtypes != dtype_set:
995 raise TypeError("string dtypes are not allowed, use 'object' instead")
996
997
998def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
999 """coerce the indexer input array to the smallest dtype possible"""
1000 length = len(categories)
1001 if length < _int8_max:
1002 return ensure_int8(indexer)
1003 elif length < _int16_max:
1004 return ensure_int16(indexer)
1005 elif length < _int32_max:
1006 return ensure_int32(indexer)
1007 return ensure_int64(indexer)
1008
1009
1010def convert_dtypes(
1011 input_array: ArrayLike,
1012 convert_string: bool = True,
1013 convert_integer: bool = True,
1014 convert_boolean: bool = True,
1015 convert_floating: bool = True,
1016 infer_objects: bool = False,
1017 dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
1018) -> DtypeObj:
1019 """
1020 Convert objects to best possible type, and optionally,
1021 to types supporting ``pd.NA``.
1022
1023 Parameters
1024 ----------
1025 input_array : ExtensionArray or np.ndarray
1026 convert_string : bool, default True
1027 Whether object dtypes should be converted to ``StringDtype()``.
1028 convert_integer : bool, default True
1029 Whether, if possible, conversion can be done to integer extension types.
1030 convert_boolean : bool, defaults True
1031 Whether object dtypes should be converted to ``BooleanDtypes()``.
1032 convert_floating : bool, defaults True
1033 Whether, if possible, conversion can be done to floating extension types.
1034 If `convert_integer` is also True, preference will be give to integer
1035 dtypes if the floats can be faithfully casted to integers.
1036 infer_objects : bool, defaults False
1037 Whether to also infer objects to float/int if possible. Is only hit if the
1038 object array contains pd.NA.
1039 dtype_backend : str, default "numpy_nullable"
1040 Nullable dtype implementation to use.
1041
1042 * "numpy_nullable" returns numpy-backed nullable types
1043 * "pyarrow" returns pyarrow-backed nullable types using ``ArrowDtype``
1044
1045 Returns
1046 -------
1047 np.dtype, or ExtensionDtype
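
    Examples
    --------
    Illustrative (a plain int64 ndarray maps to the nullable Int64 dtype):

    >>> convert_dtypes(np.array([1, 2, 3]))
    Int64Dtype()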
1048 """
1049 inferred_dtype: str | DtypeObj
1050
1051 from pandas.core.arrays.arrow.dtype import ArrowDtype
1052
1053 if (
1054 convert_string or convert_integer or convert_boolean or convert_floating
1055 ) and isinstance(input_array, np.ndarray):
1056 if is_object_dtype(input_array.dtype):
1057 inferred_dtype = lib.infer_dtype(input_array)
1058 else:
1059 inferred_dtype = input_array.dtype
1060
1061 if is_string_dtype(inferred_dtype):
1062 if not convert_string or inferred_dtype == "bytes":
1063 inferred_dtype = input_array.dtype
1064 else:
1065 inferred_dtype = pandas_dtype_func("string")
1066
1067 if convert_integer:
1068 target_int_dtype = pandas_dtype_func("Int64")
1069
1070 if is_integer_dtype(input_array.dtype):
1071 from pandas.core.arrays.integer import INT_STR_TO_DTYPE
1072
1073 inferred_dtype = INT_STR_TO_DTYPE.get(
1074 input_array.dtype.name, target_int_dtype
1075 )
1076 elif is_numeric_dtype(input_array.dtype):
1077 # TODO: de-dup with maybe_cast_to_integer_array?
1078 arr = input_array[notna(input_array)]
1079 if (arr.astype(int) == arr).all():
1080 inferred_dtype = target_int_dtype
1081 else:
1082 inferred_dtype = input_array.dtype
1083 elif (
1084 infer_objects
1085 and is_object_dtype(input_array.dtype)
1086 and (isinstance(inferred_dtype, str) and inferred_dtype == "integer")
1087 ):
1088 inferred_dtype = target_int_dtype
1089
1090 if convert_floating:
1091 if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
1092 input_array.dtype
1093 ):
1094 from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE
1095
1096 inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get(
1097 input_array.dtype.name, pandas_dtype_func("Float64")
1098 )
1099 # if we could also convert to integer, check if all floats
1100 # are actually integers
1101 if convert_integer:
1102 # TODO: de-dup with maybe_cast_to_integer_array?
1103 arr = input_array[notna(input_array)]
1104 if (arr.astype(int) == arr).all():
1105 inferred_dtype = pandas_dtype_func("Int64")
1106 else:
1107 inferred_dtype = inferred_float_dtype
1108 else:
1109 inferred_dtype = inferred_float_dtype
1110 elif (
1111 infer_objects
1112 and is_object_dtype(input_array.dtype)
1113 and (
1114 isinstance(inferred_dtype, str)
1115 and inferred_dtype == "mixed-integer-float"
1116 )
1117 ):
1118 inferred_dtype = pandas_dtype_func("Float64")
1119
1120 if convert_boolean:
1121 if is_bool_dtype(input_array.dtype):
1122 inferred_dtype = pandas_dtype_func("boolean")
1123 elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
1124 inferred_dtype = pandas_dtype_func("boolean")
1125
1126 if isinstance(inferred_dtype, str):
1127 # If we couldn't do anything else, then we retain the dtype
1128 inferred_dtype = input_array.dtype
1129
1130 else:
1131 inferred_dtype = input_array.dtype
1132
1133 if dtype_backend == "pyarrow":
1134 from pandas.core.arrays.arrow.array import to_pyarrow_type
1135 from pandas.core.arrays.string_ import StringDtype
1136
1137 assert not isinstance(inferred_dtype, str)
1138
1139 if (
1140 (convert_integer and inferred_dtype.kind in "iu")
1141 or (convert_floating and inferred_dtype.kind in "fc")
1142 or (convert_boolean and inferred_dtype.kind == "b")
1143 or (convert_string and isinstance(inferred_dtype, StringDtype))
1144 or (
1145 inferred_dtype.kind not in "iufcb"
1146 and not isinstance(inferred_dtype, StringDtype)
1147 )
1148 ):
1149 if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance(
1150 inferred_dtype, DatetimeTZDtype
1151 ):
1152 base_dtype = inferred_dtype.base
1153 elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
1154 base_dtype = inferred_dtype.numpy_dtype
1155 elif isinstance(inferred_dtype, StringDtype):
1156 base_dtype = np.dtype(str)
1157 else:
1158 base_dtype = inferred_dtype
1159 pa_type = to_pyarrow_type(base_dtype)
1160 if pa_type is not None:
1161 inferred_dtype = ArrowDtype(pa_type)
1162 elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype):
1163 # GH 53648
1164 inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype]
1165
1166 # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
1167 # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
1168 return inferred_dtype # type: ignore[return-value]
1169
1170
1171def maybe_infer_to_datetimelike(
1172 value: npt.NDArray[np.object_],
1173) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
1174 """
1175 we might have a array (or single object) that is datetime like,
1176 and no dtype is passed don't change the value unless we find a
1177 datetime/timedelta set
1178
1179 this is pretty strict in that a datetime/timedelta is REQUIRED
1180 in addition to possible nulls/string likes
1181
1182 Parameters
1183 ----------
1184 value : np.ndarray[object]
1185
1186 Returns
1187 -------
1188 np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray
1189
1190 """
1191 if not isinstance(value, np.ndarray) or value.dtype != object:
1192 # Caller is responsible for passing only ndarray[object]
1193 raise TypeError(type(value)) # pragma: no cover
1194 if value.ndim != 1:
1195 # Caller is responsible
1196 raise ValueError(value.ndim) # pragma: no cover
1197
1198 if not len(value):
1199 return value
1200
1201 # error: Incompatible return value type (got "Union[ExtensionArray,
1202 # ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray,
1203 # TimedeltaArray, PeriodArray, IntervalArray]")
1204 return lib.maybe_convert_objects( # type: ignore[return-value]
1205 value,
1206 # Here we do not convert numeric dtypes, as if we wanted that,
1207 # numpy would have done it for us.
1208 convert_numeric=False,
1209 convert_period=True,
1210 convert_interval=True,
1211 convert_timedelta=True,
1212 convert_datetime=True,
1213 dtype_if_all_nat=np.dtype("M8[ns]"),
1214 )
1215
1216
1217def maybe_cast_to_datetime(
1218 value: np.ndarray | list, dtype: np.dtype
1219) -> ExtensionArray | np.ndarray:
1220 """
1221 try to cast the array/value to a datetimelike dtype, converting float
1222 nan to iNaT
1223
1224 Caller is responsible for handling ExtensionDtype cases and non dt64/td64
1225 cases.
1226 """
1227 from pandas.core.arrays.datetimes import DatetimeArray
1228 from pandas.core.arrays.timedeltas import TimedeltaArray
1229
1230 assert dtype.kind in ["m", "M"]
1231 if not is_list_like(value):
1232 raise TypeError("value must be listlike")
1233
1234 # TODO: _from_sequence would raise ValueError in cases where
1235 # _ensure_nanosecond_dtype raises TypeError
1236 _ensure_nanosecond_dtype(dtype)
1237
1238 if is_timedelta64_dtype(dtype):
1239 res = TimedeltaArray._from_sequence(value, dtype=dtype)
1240 return res
1241 else:
1242 try:
1243 dta = DatetimeArray._from_sequence(value, dtype=dtype)
1244 except ValueError as err:
1245 # We can give a Series-specific exception message.
1246 if "cannot supply both a tz and a timezone-naive dtype" in str(err):
1247 raise ValueError(
1248 "Cannot convert timezone-aware data to "
1249 "timezone-naive dtype. Use "
1250 "pd.Series(values).dt.tz_localize(None) instead."
1251 ) from err
1252 raise
1253
1254 return dta
1255
1256
1257def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
1258 """
1259 Convert dtypes with granularity less than nanosecond to nanosecond
1260
1261 >>> _ensure_nanosecond_dtype(np.dtype("M8[us]"))
1262
1263 >>> _ensure_nanosecond_dtype(np.dtype("M8[D]"))
1264 Traceback (most recent call last):
1265 ...
1266 TypeError: dtype=datetime64[D] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
1267
1268 >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
1269 Traceback (most recent call last):
1270 ...
1271 TypeError: dtype=timedelta64[ps] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
1272 """ # noqa:E501
1273 msg = (
1274 f"The '{dtype.name}' dtype has no unit. "
1275 f"Please pass in '{dtype.name}[ns]' instead."
1276 )
1277
1278 # unpack e.g. SparseDtype
1279 dtype = getattr(dtype, "subtype", dtype)
1280
1281 if not isinstance(dtype, np.dtype):
1282 # i.e. datetime64tz
1283 pass
1284
1285 elif dtype.kind in ["m", "M"]:
1286 reso = get_unit_from_dtype(dtype)
1287 if not is_supported_unit(reso):
1288 # pre-2.0 we would silently swap in nanos for lower-resolutions,
1289 # raise for above-nano resolutions
1290 if dtype.name in ["datetime64", "timedelta64"]:
1291 raise ValueError(msg)
1292 # TODO: ValueError or TypeError? existing test
1293 # test_constructor_generic_timestamp_bad_frequency expects TypeError
1294 raise TypeError(
1295 f"dtype={dtype} is not supported. Supported resolutions are 's', "
1296 "'ms', 'us', and 'ns'"
1297 )
1298
1299
1300# TODO: other value-dependent functions to standardize here include
1301# Index._find_common_type_compat
1302def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
1303 """
1304 Find the type/dtype for a the result of an operation between these objects.
1305
1306 This is similar to find_common_type, but looks at the objects instead
1307 of just their dtypes. This can be useful in particular when one of the
1308 objects does not have a `dtype`.
1309
1310 Parameters
1311 ----------
1312 left : np.ndarray or ExtensionArray
1313 right : Any
1314
1315 Returns
1316 -------
1317 np.dtype or ExtensionDtype
1318
1319 See also
1320 --------
1321 find_common_type
1322 numpy.result_type
1323 """
1324 new_dtype: DtypeObj
1325
1326 if (
1327 isinstance(left, np.ndarray)
1328 and left.dtype.kind in ["i", "u", "c"]
1329 and (lib.is_integer(right) or lib.is_float(right))
1330 ):
1331 # e.g. with int8 dtype and right=512, we want to end up with
1332 # np.int16, whereas infer_dtype_from(512) gives np.int64,
1333 # which will make us upcast too far.
1334 if lib.is_float(right) and right.is_integer() and left.dtype.kind != "f":
1335 right = int(right)
1336
1337 new_dtype = np.result_type(left, right)
1338
1339 elif is_valid_na_for_dtype(right, left.dtype):
1340 # e.g. IntervalDtype[int] and None/np.nan
1341 new_dtype = ensure_dtype_can_hold_na(left.dtype)
1342
1343 else:
1344 dtype, _ = infer_dtype_from(right, pandas_dtype=True)
1345
1346 new_dtype = find_common_type([left.dtype, dtype])
1347
1348 return new_dtype
1349
1350
1351def common_dtype_categorical_compat(
1352 objs: list[Index | ArrayLike], dtype: DtypeObj
1353) -> DtypeObj:
1354 """
1355 Update the result of find_common_type to account for NAs in a Categorical.
1356
1357 Parameters
1358 ----------
1359 objs : list[np.ndarray | ExtensionArray | Index]
1360 dtype : np.dtype or ExtensionDtype
1361
1362 Returns
1363 -------
1364 np.dtype or ExtensionDtype
1365 """
1366 # GH#38240
1367
1368 # TODO: more generally, could do `not can_hold_na(dtype)`
1369 if isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]:
1370 for obj in objs:
1371 # We don't want to accientally allow e.g. "categorical" str here
1372 obj_dtype = getattr(obj, "dtype", None)
1373 if isinstance(obj_dtype, CategoricalDtype):
1374 if isinstance(obj, ABCIndex):
1375 # This check may already be cached
1376 hasnas = obj.hasnans
1377 else:
1378 # Categorical
1379 hasnas = cast("Categorical", obj)._hasna
1380
1381 if hasnas:
1382 # see test_union_int_categorical_with_nan
1383 dtype = np.dtype(np.float64)
1384 break
1385 return dtype
1386
1387
1388def np_find_common_type(*dtypes: np.dtype) -> np.dtype:
1389 """
1390 np.find_common_type implementation pre-1.25 deprecation using np.result_type
1391 https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065
1392
1393 Parameters
1394 ----------
1395 dtypes : np.dtypes
1396
1397 Returns
1398 -------
1399 np.dtype
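
    Examples
    --------
    Illustrative (datetime/string promotion falls back to object):

    >>> np_find_common_type(np.dtype(np.int64), np.dtype(np.float32))
    dtype('float64')
    >>> np_find_common_type(np.dtype("M8[ns]"), np.dtype(np.int64))
    dtype('O')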
1400 """
1401 try:
1402 common_dtype = np.result_type(*dtypes)
1403 if common_dtype.kind in "mMSU":
1404 # NumPy promotion currently (1.25) misbehaves for for times and strings,
1405 # so fall back to object (find_common_dtype did unless there
1406 # was only one dtype)
1407 common_dtype = np.dtype("O")
1408
1409 except TypeError:
1410 common_dtype = np.dtype("O")
1411 return common_dtype
1412
1413
1414@overload
1415def find_common_type(types: list[np.dtype]) -> np.dtype:
1416 ...
1417
1418
1419@overload
1420def find_common_type(types: list[ExtensionDtype]) -> DtypeObj:
1421 ...
1422
1423
1424@overload
1425def find_common_type(types: list[DtypeObj]) -> DtypeObj:
1426 ...
1427
1428
1429def find_common_type(types):
1430 """
1431 Find a common data type among the given dtypes.
1432
1433 Parameters
1434 ----------
1435 types : list of dtypes
1436
1437 Returns
1438 -------
1439 pandas extension or numpy dtype
1440
1441 See Also
1442 --------
1443 numpy.find_common_type
1444
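    Examples
    --------
    Illustrative (bools do not silently combine with numeric dtypes):

    >>> find_common_type([np.dtype(np.int64), np.dtype(np.float32)])
    dtype('float64')
    >>> find_common_type([np.dtype(np.int64), np.dtype(np.bool_)])
    dtype('O')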
1445 """
1446 if not types:
1447 raise ValueError("no types given")
1448
1449 first = types[0]
1450
1451 # workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
1452 # => object
1453 if lib.dtypes_all_equal(list(types)):
1454 return first
1455
1456 # get unique types (dict.fromkeys is used as order-preserving set())
1457 types = list(dict.fromkeys(types).keys())
1458
1459 if any(isinstance(t, ExtensionDtype) for t in types):
1460 for t in types:
1461 if isinstance(t, ExtensionDtype):
1462 res = t._get_common_dtype(types)
1463 if res is not None:
1464 return res
1465 return np.dtype("object")
1466
1467 # take lowest unit
1468 if all(is_datetime64_dtype(t) for t in types):
1469 return np.dtype(max(types))
1470 if all(is_timedelta64_dtype(t) for t in types):
1471 return np.dtype(max(types))
1472
1473 # don't mix bool / int or float or complex
1474 # this is different from numpy, which casts bool with float/int as int
1475 has_bools = any(is_bool_dtype(t) for t in types)
1476 if has_bools:
1477 for t in types:
1478 if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
1479 return np.dtype("object")
1480
1481 return np_find_common_type(*types)
1482
1483
1484def construct_2d_arraylike_from_scalar(
1485 value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
1486) -> np.ndarray:
1487 shape = (length, width)
1488
1489 if dtype.kind in ["m", "M"]:
1490 value = _maybe_box_and_unbox_datetimelike(value, dtype)
1491 elif dtype == _dtype_obj:
1492 if isinstance(value, (np.timedelta64, np.datetime64)):
1493 # calling np.array below would cast to pytimedelta/pydatetime
1494 out = np.empty(shape, dtype=object)
1495 out.fill(value)
1496 return out
1497
1498 # Attempt to coerce to a numpy array
1499 try:
1500 arr = np.array(value, dtype=dtype, copy=copy)
1501 except (ValueError, TypeError) as err:
1502 raise TypeError(
1503 f"DataFrame constructor called with incompatible data and dtype: {err}"
1504 ) from err
1505
1506 if arr.ndim != 0:
1507 raise ValueError("DataFrame constructor not properly called!")
1508
1509 return np.full(shape, arr)
1510
1511
1512def construct_1d_arraylike_from_scalar(
1513 value: Scalar, length: int, dtype: DtypeObj | None
1514) -> ArrayLike:
1515 """
1516 create a np.ndarray / pandas type of specified shape and dtype
1517 filled with values
1518
1519 Parameters
1520 ----------
1521 value : scalar value
1522 length : int
1523 dtype : pandas_dtype or np.dtype
1524
1525 Returns
1526 -------
1527 np.ndarray / pandas type of length, filled with value
1528
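    Examples
    --------
    Illustrative (scalar broadcast to the requested length and dtype):

    >>> construct_1d_arraylike_from_scalar(1.0, 3, np.dtype(np.float64))
    array([1., 1., 1.])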
1529 """
1530
1531 if dtype is None:
1532 try:
1533 dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True)
1534 except OutOfBoundsDatetime:
1535 dtype = _dtype_obj
1536
1537 if isinstance(dtype, ExtensionDtype):
1538 cls = dtype.construct_array_type()
1539 seq = [] if length == 0 else [value]
1540 subarr = cls._from_sequence(seq, dtype=dtype).repeat(length)
1541
1542 else:
1543 if length and is_integer_dtype(dtype) and isna(value):
1544 # coerce if we have nan for an integer dtype
1545 dtype = np.dtype("float64")
1546 elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"):
1547 # we need to coerce to object dtype to avoid
1548 # to allow numpy to take our string as a scalar value
1549 dtype = np.dtype("object")
1550 if not isna(value):
1551 value = ensure_str(value)
1552 elif dtype.kind in ["M", "m"]:
1553 value = _maybe_box_and_unbox_datetimelike(value, dtype)
1554
1555 subarr = np.empty(length, dtype=dtype)
1556 if length:
1557 # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes
1558 subarr.fill(value)
1559
1560 return subarr
1561
1562
1563def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
1564 # Caller is responsible for checking dtype.kind in ["m", "M"]
1565
1566 if isinstance(value, dt.datetime):
1567 # we dont want to box dt64, in particular datetime64("NaT")
1568 value = maybe_box_datetimelike(value, dtype)
1569
1570 return _maybe_unbox_datetimelike(value, dtype)
1571
1572
1573def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
1574 """
1575 Transform any list-like object in a 1-dimensional numpy array of object
1576 dtype.
1577
1578 Parameters
1579 ----------
1580 values : any iterable which has a len()
1581
1582 Raises
1583 ------
1584 TypeError
1585 * If `values` does not have a len()
1586
1587 Returns
1588 -------
1589 1-dimensional numpy array of dtype object
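
    Examples
    --------
    Illustrative (nested list-likes are kept as objects, not extra dimensions):

    >>> construct_1d_object_array_from_listlike([[1, 2], [3, 4]])
    array([list([1, 2]), list([3, 4])], dtype=object)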
1590 """
1591 # numpy will try to interpret nested lists as further dimensions, hence
1592 # making a 1D array that contains list-likes is a bit tricky:
1593 result = np.empty(len(values), dtype="object")
1594 result[:] = values
1595 return result
1596
1597
1598def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray:
1599 """
1600 Takes any dtype and returns the casted version, raising for when data is
1601 incompatible with integer/unsigned integer dtypes.
1602
1603 Parameters
1604 ----------
1605 arr : np.ndarray or list
1606 The array to cast.
1607 dtype : np.dtype
1608 The integer dtype to cast the array to.
1609
1610 Returns
1611 -------
1612 ndarray
1613 Array of integer or unsigned integer dtype.
1614
1615 Raises
1616 ------
1617 OverflowError : the dtype is incompatible with the data
1618 ValueError : loss of precision has occurred during casting
1619
1620 Examples
1621 --------
1622 If you try to coerce negative values to unsigned integers, it raises:
1623
1624 >>> pd.Series([-1], dtype="uint64")
1625 Traceback (most recent call last):
1626 ...
1627 OverflowError: Trying to coerce negative values to unsigned integers
1628
1629 Also, if you try to coerce float values to integers, it raises:
1630
1631 >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
1632 Traceback (most recent call last):
1633 ...
1634 ValueError: Trying to coerce float values to integers
1635 """
1636 assert is_integer_dtype(dtype)
1637
1638 try:
1639 if not isinstance(arr, np.ndarray):
1640 with warnings.catch_warnings():
1641 # We already disallow dtype=uint w/ negative numbers
1642 # (test_constructor_coercion_signed_to_unsigned) so safe to ignore.
1643 warnings.filterwarnings(
1644 "ignore",
1645 "NumPy will stop allowing conversion of out-of-bound Python int",
1646 DeprecationWarning,
1647 )
1648 casted = np.array(arr, dtype=dtype, copy=False)
1649 else:
1650 with warnings.catch_warnings():
1651 warnings.filterwarnings("ignore", category=RuntimeWarning)
1652 casted = arr.astype(dtype, copy=False)
1653 except OverflowError as err:
1654 raise OverflowError(
1655 "The elements provided in the data cannot all be "
1656 f"casted to the dtype {dtype}"
1657 ) from err
1658
1659 if isinstance(arr, np.ndarray) and arr.dtype == dtype:
1660 # avoid expensive array_equal check
1661 return casted
1662
1663 with warnings.catch_warnings():
1664 warnings.filterwarnings("ignore", category=RuntimeWarning)
1665 if np.array_equal(arr, casted):
1666 return casted
1667
1668 # We do this casting to allow for proper
1669 # data and dtype checking.
1670 #
1671 # We didn't do this earlier because NumPy
1672 # doesn't handle `uint64` correctly.
1673 arr = np.asarray(arr)
1674
1675 if np.issubdtype(arr.dtype, str):
1676 if (casted.astype(str) == arr).all():
1677 return casted
1678 raise ValueError(f"string values cannot be losslessly cast to {dtype}")
1679
1680 if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
1681 raise OverflowError("Trying to coerce negative values to unsigned integers")
1682
1683 if is_float_dtype(arr.dtype):
1684 if not np.isfinite(arr).all():
1685 raise IntCastingNaNError(
1686 "Cannot convert non-finite values (NA or inf) to integer"
1687 )
1688 raise ValueError("Trying to coerce float values to integers")
1689 if is_object_dtype(arr.dtype):
1690 raise ValueError("Trying to coerce float values to integers")
1691
1692 if casted.dtype < arr.dtype:
1693 # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows
1694 raise ValueError(
1695 f"Values are too large to be losslessly converted to {dtype}. "
1696 f"To cast anyway, use pd.Series(values).astype({dtype})"
1697 )
1698
1699 if arr.dtype.kind in ["m", "M"]:
1700 # test_constructor_maskedarray_nonfloat
1701 raise TypeError(
1702 f"Constructing a Series or DataFrame from {arr.dtype} values and "
1703 f"dtype={dtype} is not supported. Use values.view({dtype}) instead."
1704 )
1705
1706 # No known cases that get here, but raising explicitly to cover our bases.
1707 raise ValueError(f"values cannot be losslessly cast to {dtype}")
1708
1709
1710def can_hold_element(arr: ArrayLike, element: Any) -> bool:
1711 """
1712 Can we do an inplace setitem with this element in an array with this dtype?
1713
1714 Parameters
1715 ----------
1716 arr : np.ndarray or ExtensionArray
1717 element : Any
1718
1719 Returns
1720 -------
1721 bool
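
    Examples
    --------
    Illustrative (512 overflows int8 but fits int64):

    >>> can_hold_element(np.array([1, 2], dtype=np.int64), 512)
    True
    >>> can_hold_element(np.array([1, 2], dtype=np.int8), 512)
    False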
1722 """
1723 dtype = arr.dtype
1724 if not isinstance(dtype, np.dtype) or dtype.kind in ["m", "M"]:
1725 if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)):
1726 # np.dtype here catches datetime64ns and timedelta64ns; we assume
1727 # in this case that we have DatetimeArray/TimedeltaArray
1728 arr = cast(
1729 "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr
1730 )
1731 try:
1732 arr._validate_setitem_value(element)
1733 return True
1734 except (ValueError, TypeError):
1735 # TODO: re-use _catch_deprecated_value_error to ensure we are
1736 # strict about what exceptions we allow through here.
1737 return False
1738
1739 # This is technically incorrect, but maintains the behavior of
1740 # ExtensionBlock._can_hold_element
1741 return True
1742
1743 try:
1744 np_can_hold_element(dtype, element)
1745 return True
1746 except (TypeError, LossySetitemError):
1747 return False
1748
1749
1750def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
1751 """
1752 Raise if we cannot losslessly set this element into an ndarray with this dtype.
1753
1754 Specifically about places where we disagree with numpy. i.e. there are
1755 cases where numpy will raise in doing the setitem that we do not check
1756 for here, e.g. setting str "X" into a numeric ndarray.
1757
1758 Returns
1759 -------
1760 Any
1761 The element, potentially cast to the dtype.
1762
1763 Raises
1764 ------
1765 ValueError : If we cannot losslessly store this element with this dtype.
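
    Examples
    --------
    Illustrative (512 cannot be stored losslessly in an int8 ndarray):

    >>> np_can_hold_element(np.dtype(np.int8), 512)
    Traceback (most recent call last):
        ...
    pandas.errors.LossySetitemError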
1766 """
1767 if dtype == _dtype_obj:
1768 return element
1769
1770 tipo = _maybe_infer_dtype_type(element)
1771
1772 if dtype.kind in ["i", "u"]:
1773 if isinstance(element, range):
1774 if _dtype_can_hold_range(element, dtype):
1775 return element
1776 raise LossySetitemError
1777
1778 if is_integer(element) or (is_float(element) and element.is_integer()):
1779 # e.g. test_setitem_series_int8 if we have a python int 1
1780 # tipo may be np.int32, despite the fact that it will fit
1781 # in smaller int dtypes.
1782 info = np.iinfo(dtype)
1783 if info.min <= element <= info.max:
1784 return dtype.type(element)
1785 raise LossySetitemError
1786
1787 if tipo is not None:
1788 if tipo.kind not in ["i", "u"]:
1789 if isinstance(element, np.ndarray) and element.dtype.kind == "f":
1790 # If all can be losslessly cast to integers, then we can hold them
1791 with np.errstate(invalid="ignore"):
1792 # We check afterwards if cast was losslessly, so no need to show
1793 # the warning
1794 casted = element.astype(dtype)
1795 comp = casted == element
1796 if comp.all():
1797 # Return the casted values bc they can be passed to
1798 # np.putmask, whereas the raw values cannot.
1799 # see TestSetitemFloatNDarrayIntoIntegerSeries
1800 return casted
1801 raise LossySetitemError
1802
1803 # Anything other than integer we cannot hold
1804 raise LossySetitemError
1805 if (
1806 dtype.kind == "u"
1807 and isinstance(element, np.ndarray)
1808 and element.dtype.kind == "i"
1809 ):
1810 # see test_where_uint64
1811 casted = element.astype(dtype)
1812 if (casted == element).all():
1813 # TODO: faster to check (element >=0).all()? potential
1814 # itemsize issues there?
1815 return casted
1816 raise LossySetitemError
1817 if dtype.itemsize < tipo.itemsize:
1818 raise LossySetitemError
1819 if not isinstance(tipo, np.dtype):
1820 # i.e. nullable IntegerDtype; we can put this into an ndarray
1821 # losslessly iff it has no NAs
1822 if element._hasna:
1823 raise LossySetitemError
1824 return element
1825
1826 return element
1827
1828 raise LossySetitemError
1829
1830 if dtype.kind == "f":
1831 if lib.is_integer(element) or lib.is_float(element):
1832 casted = dtype.type(element)
1833 if np.isnan(casted) or casted == element:
1834 return casted
1835 # otherwise e.g. overflow see TestCoercionFloat32
1836 raise LossySetitemError
1837
1838 if tipo is not None:
1839 # TODO: itemsize check?
1840 if tipo.kind not in ["f", "i", "u"]:
1841 # Anything other than float/integer we cannot hold
1842 raise LossySetitemError
1843 if not isinstance(tipo, np.dtype):
1844 # i.e. nullable IntegerDtype or FloatingDtype;
1845 # we can put this into an ndarray losslessly iff it has no NAs
1846 if element._hasna:
1847 raise LossySetitemError
1848 return element
1849 elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
1850 if isinstance(element, np.ndarray):
1851 # e.g. TestDataFrameIndexingWhere::test_where_alignment
1852 casted = element.astype(dtype)
1853 if np.array_equal(casted, element, equal_nan=True):
1854 return casted
1855 raise LossySetitemError
1856
1857 return element
1858
1859 raise LossySetitemError
1860
1861 if dtype.kind == "c":
1862 if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element):
1863 if np.isnan(element):
1864 # see test_where_complex GH#6345
1865 return dtype.type(element)
1866
1867 with warnings.catch_warnings():
1868 warnings.filterwarnings("ignore")
1869 casted = dtype.type(element)
1870 if casted == element:
1871 return casted
1872 # otherwise e.g. overflow see test_32878_complex_itemsize
1873 raise LossySetitemError
1874
1875 if tipo is not None:
1876 if tipo.kind in ["c", "f", "i", "u"]:
1877 return element
1878 raise LossySetitemError
1879 raise LossySetitemError
1880
1881 if dtype.kind == "b":
1882 if tipo is not None:
1883 if tipo.kind == "b":
1884 if not isinstance(tipo, np.dtype):
1885 # i.e. we have a BooleanArray
1886 if element._hasna:
1887 # i.e. there are pd.NA elements
1888 raise LossySetitemError
1889 return element
1890 raise LossySetitemError
1891 if lib.is_bool(element):
1892 return element
1893 raise LossySetitemError
1894
1895 if dtype.kind == "S":
1896 # TODO: test tests.frame.methods.test_replace tests get here,
1897 # need more targeted tests. xref phofl has a PR about this
1898 if tipo is not None:
1899 if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize:
1900 return element
1901 raise LossySetitemError
1902 if isinstance(element, bytes) and len(element) <= dtype.itemsize:
1903 return element
1904 raise LossySetitemError
1905
1906 if dtype.kind == "V":
1907 # i.e. np.void, which cannot hold _anything_
1908 raise LossySetitemError
1909
1910 raise NotImplementedError(dtype)
1911
1912
1913def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
1914 """
1915 _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),
1916 but in many cases a range can be held by a smaller integer dtype.
1917 Check if this is one of those cases.
1918 """
1919 if not len(rng):
1920 return True
1921 return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)