1"""
2Routines for casting.
3"""
4
5from __future__ import annotations
6
7import datetime as dt
8import functools
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Literal,
13 TypeVar,
14 cast,
15 overload,
16)
17import warnings
18
19import numpy as np
20
21from pandas._config import using_pyarrow_string_dtype
22
23from pandas._libs import (
24 Interval,
25 Period,
26 lib,
27)
28from pandas._libs.missing import (
29 NA,
30 NAType,
31 checknull,
32)
33from pandas._libs.tslibs import (
34 NaT,
35 OutOfBoundsDatetime,
36 OutOfBoundsTimedelta,
37 Timedelta,
38 Timestamp,
39 is_supported_dtype,
40)
41from pandas._libs.tslibs.timedeltas import array_to_timedelta64
42from pandas.errors import (
43 IntCastingNaNError,
44 LossySetitemError,
45)
46
47from pandas.core.dtypes.common import (
48 ensure_int8,
49 ensure_int16,
50 ensure_int32,
51 ensure_int64,
52 ensure_object,
53 ensure_str,
54 is_bool,
55 is_complex,
56 is_float,
57 is_integer,
58 is_object_dtype,
59 is_scalar,
60 is_string_dtype,
61 pandas_dtype as pandas_dtype_func,
62)
63from pandas.core.dtypes.dtypes import (
64 ArrowDtype,
65 BaseMaskedDtype,
66 CategoricalDtype,
67 DatetimeTZDtype,
68 ExtensionDtype,
69 IntervalDtype,
70 PandasExtensionDtype,
71 PeriodDtype,
72)
73from pandas.core.dtypes.generic import (
74 ABCExtensionArray,
75 ABCIndex,
76 ABCSeries,
77)
78from pandas.core.dtypes.inference import is_list_like
79from pandas.core.dtypes.missing import (
80 is_valid_na_for_dtype,
81 isna,
82 na_value_for_dtype,
83 notna,
84)
85
86from pandas.io._util import _arrow_dtype_mapping
87
88if TYPE_CHECKING:
89 from collections.abc import (
90 Sequence,
91 Sized,
92 )
93
94 from pandas._typing import (
95 ArrayLike,
96 Dtype,
97 DtypeObj,
98 NumpyIndexT,
99 Scalar,
100 npt,
101 )
102
103 from pandas import Index
104 from pandas.core.arrays import (
105 Categorical,
106 DatetimeArray,
107 ExtensionArray,
108 IntervalArray,
109 PeriodArray,
110 TimedeltaArray,
111 )
112
113
114_int8_max = np.iinfo(np.int8).max
115_int16_max = np.iinfo(np.int16).max
116_int32_max = np.iinfo(np.int32).max
117
118_dtype_obj = np.dtype(object)
119
120NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray)
121
122
def maybe_convert_platform(
    values: list | tuple | range | np.ndarray | ExtensionArray,
) -> ArrayLike:
    """try to do platform conversion, allow ndarray or list here"""
    arr: ArrayLike

    if not isinstance(values, (list, tuple, range)):
        # The caller is responsible for ensuring that we have np.ndarray
        # or ExtensionArray here.
        arr = values
    else:
        arr = construct_1d_object_array_from_listlike(values)

    if arr.dtype != _dtype_obj:
        return arr

    # Object dtype may hide homogeneous values (e.g. all ints); let the
    # cython inference pick a tighter dtype where possible.
    return lib.maybe_convert_objects(cast(np.ndarray, arr))
141
142
def is_nested_object(obj) -> bool:
    """
    Check whether ``obj`` is an object-dtype Series containing at least one
    Series element (i.e. a "nested" object).

    This may not necessarily be performant (iterates the values).
    """
    if not isinstance(obj, ABCSeries):
        return False
    if not is_object_dtype(obj.dtype):
        # Only object dtype can hold arbitrary Python objects.
        return False
    return any(isinstance(element, ABCSeries) for element in obj._values)
156
157
def maybe_box_datetimelike(value: Scalar, dtype: Dtype | None = None) -> Scalar:
    """
    Cast scalar to Timestamp or Timedelta if scalar is datetime-like
    and dtype is not object.

    Parameters
    ----------
    value : scalar
    dtype : Dtype, optional

    Returns
    -------
    scalar
    """
    # With object dtype the caller wants the raw value, not a pandas box.
    if dtype != _dtype_obj:
        if isinstance(value, (np.datetime64, dt.datetime)):
            return Timestamp(value)
        if isinstance(value, (np.timedelta64, dt.timedelta)):
            return Timedelta(value)
    return value
180
181
def maybe_box_native(value: Scalar | None | NAType) -> Scalar | None | NAType:
    """
    If passed a scalar cast the scalar to a python native type.

    Parameters
    ----------
    value : scalar or Series

    Returns
    -------
    scalar or Series
    """
    # Order matters: bool is an int subclass, but is_integer/is_bool from
    # pandas distinguish them, matching the original branch order.
    if is_float(value):
        return float(value)
    if is_integer(value):
        return int(value)
    if is_bool(value):
        return bool(value)
    if isinstance(value, (np.datetime64, np.timedelta64)):
        return maybe_box_datetimelike(value)
    if value is NA:
        return None
    return value
205
206
def _maybe_unbox_datetimelike(value: Scalar, dtype: DtypeObj) -> Scalar:
    """
    Convert a Timestamp or Timedelta to timedelta64 or datetime64 for setting
    into a numpy array. Failing to unbox would risk dropping nanoseconds.

    Notes
    -----
    Caller is responsible for checking dtype.kind in "mM"
    """
    if is_valid_na_for_dtype(value, dtype):
        # GH#36541: can't fill array directly with pd.NaT
        # > np.empty(10, dtype="datetime64[ns]").fill(pd.NaT)
        # ValueError: cannot convert float NaN to integer
        value = dtype.type("NaT", "ns")
    elif isinstance(value, Timestamp):
        if value.tz is not None:
            # A tz-aware Timestamp can only be kept boxed for a tz-aware dtype.
            if not isinstance(dtype, DatetimeTZDtype):
                raise TypeError("Cannot unbox tzaware Timestamp to tznaive dtype")
        else:
            value = value.to_datetime64()
    elif isinstance(value, Timedelta):
        value = value.to_timedelta64()

    _disallow_mismatched_datetimelike(value, dtype)
    return value
231
232
def _disallow_mismatched_datetimelike(value, dtype: DtypeObj):
    """
    numpy allows np.array(dt64values, dtype="timedelta64[ns]") and
    vice-versa, but we do not want to allow this, so we need to
    check explicitly
    """
    vdtype = getattr(value, "dtype", None)
    if vdtype is None:
        return
    # Raise exactly when one side is datetime64 and the other timedelta64.
    if {vdtype.kind, dtype.kind} == {"m", "M"}:
        raise TypeError(f"Cannot cast {repr(value)} to {dtype}")
246
247
@overload
def maybe_downcast_to_dtype(result: np.ndarray, dtype: str | np.dtype) -> np.ndarray:
    ...


@overload
def maybe_downcast_to_dtype(result: ExtensionArray, dtype: str | np.dtype) -> ArrayLike:
    ...


def maybe_downcast_to_dtype(result: ArrayLike, dtype: str | np.dtype) -> ArrayLike:
    """
    try to cast to the specified dtype (e.g. convert back to bool/int
    or could be an astype of float64->float32

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : str or np.dtype
        The string "infer" means choose a target dtype from the inferred
        type of ``result``'s values.

    Returns
    -------
    ndarray or ExtensionArray
        ``result`` unchanged if no lossless downcast applies.
    """
    if isinstance(result, ABCSeries):
        # defensive: callers are expected to pass arrays, not Series
        result = result._values
    do_round = False

    if isinstance(dtype, str):
        if dtype == "infer":
            # map the inferred type of the values to a concrete target dtype
            inferred_type = lib.infer_dtype(result, skipna=False)
            if inferred_type == "boolean":
                dtype = "bool"
            elif inferred_type == "integer":
                dtype = "int64"
            elif inferred_type == "datetime64":
                dtype = "datetime64[ns]"
            elif inferred_type in ["timedelta", "timedelta64"]:
                dtype = "timedelta64[ns]"

            # try to upcast here
            elif inferred_type == "floating":
                dtype = "int64"
                if issubclass(result.dtype.type, np.number):
                    # floats only downcast to int when rounding is lossless;
                    # do_round makes the equality check below round first
                    do_round = True

            else:
                # TODO: complex? what if result is already non-object?
                dtype = "object"

        dtype = np.dtype(dtype)

    if not isinstance(dtype, np.dtype):
        # enforce our signature annotation
        raise TypeError(dtype)  # pragma: no cover

    converted = maybe_downcast_numeric(result, dtype, do_round)
    if converted is not result:
        # the numeric path succeeded; nothing more to do
        return converted

    # a datetimelike
    # GH12821, iNaT is cast to float
    if dtype.kind in "mM" and result.dtype.kind in "if":
        result = result.astype(dtype)

    elif dtype.kind == "m" and result.dtype == _dtype_obj:
        # test_where_downcast_to_td64
        result = cast(np.ndarray, result)
        result = array_to_timedelta64(result)

    elif dtype == np.dtype("M8[ns]") and result.dtype == _dtype_obj:
        result = cast(np.ndarray, result)
        return np.asarray(maybe_cast_to_datetime(result, dtype=dtype))

    return result
314
315
@overload
def maybe_downcast_numeric(
    result: np.ndarray, dtype: np.dtype, do_round: bool = False
) -> np.ndarray:
    ...


@overload
def maybe_downcast_numeric(
    result: ExtensionArray, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    ...


def maybe_downcast_numeric(
    result: ArrayLike, dtype: DtypeObj, do_round: bool = False
) -> ArrayLike:
    """
    Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    do_round : bool
        If True, round floats before checking whether the cast is lossless.

    Returns
    -------
    ndarray or ExtensionArray
        ``result`` unchanged (same object) when no lossless downcast applies.
    """
    if not isinstance(dtype, np.dtype) or not isinstance(result.dtype, np.dtype):
        # e.g. SparseDtype has no itemsize attr
        return result

    def trans(x):
        # optionally round before the exact-equality comparisons below
        if do_round:
            return x.round()
        return x

    if dtype.kind == result.dtype.kind:
        # don't allow upcasts here (except if empty)
        if result.dtype.itemsize <= dtype.itemsize and result.size:
            return result

    if dtype.kind in "biu":
        if not result.size:
            # if we don't have any elements, just astype it
            return trans(result).astype(dtype)

        # probe the first element; non-numeric objects mean no downcast
        if isinstance(result, np.ndarray):
            element = result.item(0)
        else:
            element = result.iloc[0]
        if not isinstance(element, (np.integer, np.floating, int, float, bool)):
            # a comparable, e.g. a Decimal may slip in here
            return result

        if (
            issubclass(result.dtype.type, (np.object_, np.number))
            and notna(result).all()
        ):
            new_result = trans(result).astype(dtype)
            if new_result.dtype.kind == "O" or result.dtype.kind == "O":
                # np.allclose may raise TypeError on object-dtype
                if (new_result == result).all():
                    return new_result
            else:
                if np.allclose(new_result, result, rtol=0):
                    return new_result

    elif (
        issubclass(dtype.type, np.floating)
        and result.dtype.kind != "b"
        and not is_string_dtype(result.dtype)
    ):
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", "overflow encountered in cast", RuntimeWarning
            )
            new_result = result.astype(dtype)

        # Adjust tolerances based on floating point size
        size_tols = {4: 5e-4, 8: 5e-8, 16: 5e-16}

        atol = size_tols.get(new_result.dtype.itemsize, 0.0)

        # Check downcast float values are still equal within 7 digits when
        # converting from float64 to float32
        if np.allclose(new_result, result, equal_nan=True, rtol=0.0, atol=atol):
            return new_result

    elif dtype.kind == result.dtype.kind == "c":
        new_result = result.astype(dtype)

        if np.array_equal(new_result, result, equal_nan=True):
            # TODO: use tolerance like we do for float?
            return new_result

    return result
415
416
def maybe_upcast_numeric_to_64bit(arr: NumpyIndexT) -> NumpyIndexT:
    """
    If array is a int/uint/float bit size lower than 64 bit, upcast it to 64 bit.

    Parameters
    ----------
    arr : ndarray or ExtensionArray

    Returns
    -------
    ndarray or ExtensionArray
    """
    # Each numeric kind maps to its 64-bit representative.
    targets = {"i": np.int64, "u": np.uint64, "f": np.float64}
    dtype = arr.dtype
    target = targets.get(dtype.kind)
    if target is not None and dtype != target:
        return arr.astype(target)
    # Already 64-bit, or a non-int/uint/float kind: return unchanged.
    return arr
438
439
440def maybe_cast_pointwise_result(
441 result: ArrayLike,
442 dtype: DtypeObj,
443 numeric_only: bool = False,
444 same_dtype: bool = True,
445) -> ArrayLike:
446 """
447 Try casting result of a pointwise operation back to the original dtype if
448 appropriate.
449
450 Parameters
451 ----------
452 result : array-like
453 Result to cast.
454 dtype : np.dtype or ExtensionDtype
455 Input Series from which result was calculated.
456 numeric_only : bool, default False
457 Whether to cast only numerics or datetimes as well.
458 same_dtype : bool, default True
459 Specify dtype when calling _from_sequence
460
461 Returns
462 -------
463 result : array-like
464 result maybe casted to the dtype.
465 """
466
467 if isinstance(dtype, ExtensionDtype):
468 cls = dtype.construct_array_type()
469 if same_dtype:
470 result = _maybe_cast_to_extension_array(cls, result, dtype=dtype)
471 else:
472 result = _maybe_cast_to_extension_array(cls, result)
473
474 elif (numeric_only and dtype.kind in "iufcb") or not numeric_only:
475 result = maybe_downcast_to_dtype(result, dtype)
476
477 return result
478
479
def _maybe_cast_to_extension_array(
    cls: type[ExtensionArray], obj: ArrayLike, dtype: ExtensionDtype | None = None
) -> ArrayLike:
    """
    Call to `_from_sequence` that returns the object unchanged on Exception.

    Parameters
    ----------
    cls : class, subclass of ExtensionArray
    obj : arraylike
        Values to pass to cls._from_sequence
    dtype : ExtensionDtype, optional

    Returns
    -------
    ExtensionArray or obj
    """
    if dtype is not None:
        # _from_scalars is the strict path: only TypeError/ValueError mean
        # "cannot be represented"; anything else propagates.
        try:
            return cls._from_scalars(obj, dtype=dtype)
        except (TypeError, ValueError):
            return obj

    try:
        return cls._from_sequence(obj, dtype=dtype)
    except Exception:
        # We can't predict what downstream EA constructors may raise
        return obj
512
513
@overload
def ensure_dtype_can_hold_na(dtype: np.dtype) -> np.dtype:
    ...


@overload
def ensure_dtype_can_hold_na(dtype: ExtensionDtype) -> ExtensionDtype:
    ...


def ensure_dtype_can_hold_na(dtype: DtypeObj) -> DtypeObj:
    """
    If we have a dtype that cannot hold NA values, find the best match that can.
    """
    if isinstance(dtype, ExtensionDtype):
        if dtype._can_hold_na:
            return dtype
        if isinstance(dtype, IntervalDtype):
            # TODO(GH#45349): don't special-case IntervalDtype, allow
            # overriding instead of returning object below.
            return IntervalDtype(np.float64, closed=dtype.closed)
        return _dtype_obj
    if dtype.kind == "b":
        # numpy bool cannot hold NA -> fall back to object
        return _dtype_obj
    if dtype.kind in "iu":
        # integers promote to float64 so NaN fits
        return np.dtype(np.float64)
    return dtype
541
542
# Canonical NA sentinels keyed by type: used by maybe_promote to replace
# equivalent-but-distinct NaN/NaT objects with singletons, so the lru-cached
# promotion lookup gets cache hits for all equivalent inputs.
_canonical_nans = {
    np.datetime64: np.datetime64("NaT", "ns"),
    np.timedelta64: np.timedelta64("NaT", "ns"),
    type(np.nan): np.nan,
}
548
549
def maybe_promote(dtype: np.dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype
    fill_value : scalar, default np.nan

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If fill_value is a non-scalar and dtype is not object.
    """
    orig = fill_value
    orig_is_nat = False
    if checknull(fill_value):
        # https://github.com/pandas-dev/pandas/pull/39692#issuecomment-1441051740
        # avoid cache misses with NaN/NaT values that are not singletons
        if fill_value is not NA:
            try:
                orig_is_nat = np.isnat(fill_value)
            except TypeError:
                pass

        # swap in a canonical singleton so the cached lookup below hits
        fill_value = _canonical_nans.get(type(fill_value), fill_value)

    # for performance, we are using a cached version of the actual implementation
    # of the function in _maybe_promote. However, this doesn't always work (in case
    # of non-hashable arguments), so we fallback to the actual implementation if needed
    try:
        # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type
        # "Type[Any]"; expected "Hashable"  [arg-type]
        dtype, fill_value = _maybe_promote_cached(
            dtype, fill_value, type(fill_value)  # type: ignore[arg-type]
        )
    except TypeError:
        # if fill_value is not hashable (required for caching)
        dtype, fill_value = _maybe_promote(dtype, fill_value)

    if (dtype == _dtype_obj and orig is not None) or (
        orig_is_nat and np.datetime_data(orig)[0] != "ns"
    ):
        # GH#51592,53497 restore our potentially non-canonical fill_value
        fill_value = orig
    return dtype, fill_value
603
604
@functools.lru_cache
def _maybe_promote_cached(dtype, fill_value, fill_value_type):
    # The cached version of _maybe_promote below.
    # fill_value_type is an (otherwise unused) extra cache-key component that
    # differentiates values which compare/hash equal, e.g. 1 vs True.
    return _maybe_promote(dtype, fill_value)
611
612
def _maybe_promote(dtype: np.dtype, fill_value=np.nan):
    # The actual implementation of the function, use `maybe_promote` above for
    # a cached version. Returns the (possibly upcast) dtype together with a
    # fill_value coerced to that dtype.
    if not is_scalar(fill_value):
        # with object dtype there is nothing to promote, and the user can
        # pass pretty much any weird fill_value they like
        if dtype != object:
            raise ValueError("fill_value must be a scalar")
        dtype = _dtype_obj
        return dtype, fill_value

    if is_valid_na_for_dtype(fill_value, dtype) and dtype.kind in "iufcmM":
        dtype = ensure_dtype_can_hold_na(dtype)
        fv = na_value_for_dtype(dtype)
        return dtype, fv

    elif isinstance(dtype, CategoricalDtype):
        if fill_value in dtype.categories or isna(fill_value):
            # a known category (or NA) fits without promotion
            return dtype, fill_value
        else:
            return object, ensure_object(fill_value)

    elif isna(fill_value):
        # NA-like fill for a dtype not covered above -> fall back to object
        dtype = _dtype_obj
        if fill_value is None:
            # but we retain e.g. pd.NA
            fill_value = np.nan
        return dtype, fill_value

    # returns tuple of (dtype, fill_value)
    if issubclass(dtype.type, np.datetime64):
        inferred, fv = infer_dtype_from_scalar(fill_value)
        if inferred == dtype:
            return dtype, fv

        from pandas.core.arrays import DatetimeArray

        # reuse DatetimeArray's setitem validation to decide whether the
        # fill_value is representable as datetime64[ns]
        dta = DatetimeArray._from_sequence([], dtype="M8[ns]")
        try:
            fv = dta._validate_setitem_value(fill_value)
            return dta.dtype, fv
        except (ValueError, TypeError):
            return _dtype_obj, fill_value

    elif issubclass(dtype.type, np.timedelta64):
        inferred, fv = infer_dtype_from_scalar(fill_value)
        if inferred == dtype:
            return dtype, fv

        elif inferred.kind == "m":
            # different unit, e.g. passed np.timedelta64(24, "h") with dtype=m8[ns]
            # see if we can losslessly cast it to our dtype
            unit = np.datetime_data(dtype)[0]
            try:
                td = Timedelta(fill_value).as_unit(unit, round_ok=False)
            except OutOfBoundsTimedelta:
                return _dtype_obj, fill_value
            else:
                return dtype, td.asm8

        return _dtype_obj, fill_value

    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)

        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            if not np_can_cast_scalar(fill_value, dtype):  # type: ignore[arg-type]
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)

    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst

    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if issubclass(dtype.type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
734
735
def _ensure_dtype_type(value, dtype: np.dtype):
    """
    Ensure that the given value is an instance of the given dtype.

    e.g. if out dtype is np.complex64_, we should have an instance of that
    as opposed to a python complex object.

    Parameters
    ----------
    value : object
    dtype : np.dtype

    Returns
    -------
    object
    """
    # object dtype is the one exception where we do NOT wrap the value
    # in the dtype's scalar type.
    if dtype != _dtype_obj:
        # Note: before we get here we have already excluded isna(value)
        value = dtype.type(value)
    return value
759
760
def infer_dtype_from(val) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar or array.

    Parameters
    ----------
    val : object
    """
    # Dispatch on list-likeness: arrays/lists go through the array path,
    # everything else is treated as a scalar.
    if is_list_like(val):
        return infer_dtype_from_array(val)
    return infer_dtype_from_scalar(val)
772
773
def infer_dtype_from_scalar(val) -> tuple[DtypeObj, Any]:
    """
    Interpret the dtype from a scalar.

    Parameters
    ----------
    val : object

    Returns
    -------
    tuple of (dtype, value)
        The value may be converted to a canonical form for that dtype
        (e.g. datetime -> np.datetime64).
    """
    dtype: DtypeObj = _dtype_obj

    # a 1-element ndarray
    if isinstance(val, np.ndarray):
        if val.ndim != 0:
            msg = "invalid ndarray passed to infer_dtype_from_scalar"
            raise ValueError(msg)

        dtype = val.dtype
        val = lib.item_from_zerodim(val)

    elif isinstance(val, str):
        # If we create an empty array using a string to infer
        # the dtype, NumPy will only allocate one character per entry
        # so this is kind of bad. Alternately we could use np.repeat
        # instead of np.empty (but then you still don't want things
        # coming out as np.str_!

        dtype = _dtype_obj
        if using_pyarrow_string_dtype():
            from pandas.core.arrays.string_ import StringDtype

            dtype = StringDtype(storage="pyarrow_numpy")

    elif isinstance(val, (np.datetime64, dt.datetime)):
        try:
            val = Timestamp(val)
        except OutOfBoundsDatetime:
            # too far out of range for datetime64 -> keep as object
            return _dtype_obj, val

        if val is NaT or val.tz is None:
            val = val.to_datetime64()
            dtype = val.dtype
            # TODO: test with datetime(2920, 10, 1) based on test_replace_dtypes
        else:
            dtype = DatetimeTZDtype(unit=val.unit, tz=val.tz)

    elif isinstance(val, (np.timedelta64, dt.timedelta)):
        try:
            val = Timedelta(val)
        except (OutOfBoundsTimedelta, OverflowError):
            dtype = _dtype_obj
        else:
            if val is NaT:
                val = np.timedelta64("NaT", "ns")
            else:
                val = val.asm8
            dtype = val.dtype

    elif is_bool(val):
        dtype = np.dtype(np.bool_)

    elif is_integer(val):
        if isinstance(val, np.integer):
            # preserve e.g. int8/int32 width
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.int64)

        try:
            np.array(val, dtype=dtype)
        except OverflowError:
            # e.g. a Python int too large for int64
            dtype = np.array(val).dtype

    elif is_float(val):
        if isinstance(val, np.floating):
            dtype = np.dtype(type(val))
        else:
            dtype = np.dtype(np.float64)

    elif is_complex(val):
        dtype = np.dtype(np.complex128)

    # Period/Interval are checked last; they are not covered by the
    # branches above.
    if isinstance(val, Period):
        dtype = PeriodDtype(freq=val.freq)
    elif isinstance(val, Interval):
        subtype = infer_dtype_from_scalar(val.left)[0]
        dtype = IntervalDtype(subtype=subtype, closed=val.closed)

    return dtype, val
861
862
def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]:
    """
    Convert datetimelike-keyed dicts to a Timestamp-keyed dict.

    Parameters
    ----------
    d: dict-like object

    Returns
    -------
    dict
    """
    out: dict[Scalar, Scalar] = {}
    for key, value in d.items():
        # non-datetimelike keys pass through maybe_box_datetimelike unchanged
        out[maybe_box_datetimelike(key)] = value
    return out
876
877
def infer_dtype_from_array(arr) -> tuple[DtypeObj, ArrayLike]:
    """
    Infer the dtype from an array.

    Parameters
    ----------
    arr : array

    Returns
    -------
    tuple (pandas-compat dtype, array)


    Examples
    --------
    >>> np.asarray([1, '1'])
    array(['1', '1'], dtype='<U21')

    >>> infer_dtype_from_array([1, '1'])
    (dtype('O'), [1, '1'])
    """
    if isinstance(arr, np.ndarray):
        return arr.dtype, arr

    if not is_list_like(arr):
        raise TypeError("'arr' must be list-like")

    if isinstance(getattr(arr, "dtype", None), ExtensionDtype):
        return arr.dtype, arr

    if isinstance(arr, ABCSeries):
        return arr.dtype, np.asarray(arr)

    # don't force numpy coerce with nan's
    inferred = lib.infer_dtype(arr, skipna=False)
    if inferred in ("string", "bytes", "mixed", "mixed-integer"):
        # coercing to ndarray would mangle these; report object and keep arr
        return np.dtype(np.object_), arr

    arr = np.asarray(arr)
    return arr.dtype, arr
919
920
def _maybe_infer_dtype_type(element):
    """
    Try to infer an object's dtype, for use in arithmetic ops.

    Uses `element.dtype` if that's available.
    Objects implementing the iterator protocol are cast to a NumPy array,
    and from there the array's type is used.

    Parameters
    ----------
    element : object
        Possibly has a `.dtype` attribute, and possibly the iterator
        protocol.

    Returns
    -------
    tipo : type

    Examples
    --------
    >>> from collections import namedtuple
    >>> Foo = namedtuple("Foo", "dtype")
    >>> _maybe_infer_dtype_type(Foo(np.dtype("i8")))
    dtype('int64')
    """
    if hasattr(element, "dtype"):
        return element.dtype
    if is_list_like(element):
        # coerce and read the resulting array's dtype
        return np.asarray(element).dtype
    # neither a dtype attribute nor list-like
    return None
953
954
def invalidate_string_dtypes(dtype_set: set[DtypeObj]) -> None:
    """
    Change string like dtypes to object for
    ``DataFrame.select_dtypes()``.
    """
    # error: Argument 1 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    # error: Argument 2 to <set> has incompatible type "Type[generic]"; expected
    # "Union[dtype[Any], ExtensionDtype, None]"
    string_dtypes: set[DtypeObj] = {
        np.dtype("S").type,  # type: ignore[arg-type]
        np.dtype("<U").type,  # type: ignore[arg-type]
    }
    # any overlap means a string dtype was requested
    if dtype_set & string_dtypes:
        raise TypeError("string dtypes are not allowed, use 'object' instead")
970
971
def coerce_indexer_dtype(indexer, categories) -> np.ndarray:
    """coerce the indexer input array to the smallest dtype possible"""
    length = len(categories)
    # Try the narrowest integer width first, falling through to int64.
    for capacity, ensure in (
        (_int8_max, ensure_int8),
        (_int16_max, ensure_int16),
        (_int32_max, ensure_int32),
    ):
        if length < capacity:
            return ensure(indexer)
    return ensure_int64(indexer)
982
983
def convert_dtypes(
    input_array: ArrayLike,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
    convert_floating: bool = True,
    infer_objects: bool = False,
    dtype_backend: Literal["numpy_nullable", "pyarrow"] = "numpy_nullable",
) -> DtypeObj:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or np.ndarray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, defaults True
        Whether object dtypes should be converted to ``BooleanDtypes()``.
    convert_floating : bool, defaults True
        Whether, if possible, conversion can be done to floating extension types.
        If `convert_integer` is also True, preference will be give to integer
        dtypes if the floats can be faithfully casted to integers.
    infer_objects : bool, defaults False
        Whether to also infer objects to float/int if possible. Is only hit if the
        object array contains pd.NA.
    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    np.dtype, or ExtensionDtype
        Only the dtype is returned; the caller is responsible for the actual
        cast of the values.
    """
    inferred_dtype: str | DtypeObj

    if (
        convert_string or convert_integer or convert_boolean or convert_floating
    ) and isinstance(input_array, np.ndarray):
        if input_array.dtype == object:
            # a string here is an intermediate result; it is replaced by a
            # concrete dtype (or the input dtype) before returning
            inferred_dtype = lib.infer_dtype(input_array)
        else:
            inferred_dtype = input_array.dtype

        if is_string_dtype(inferred_dtype):
            if not convert_string or inferred_dtype == "bytes":
                inferred_dtype = input_array.dtype
            else:
                inferred_dtype = pandas_dtype_func("string")

        if convert_integer:
            target_int_dtype = pandas_dtype_func("Int64")

            if input_array.dtype.kind in "iu":
                from pandas.core.arrays.integer import NUMPY_INT_TO_DTYPE

                # keep the same bit width, e.g. int32 -> Int32
                inferred_dtype = NUMPY_INT_TO_DTYPE.get(
                    input_array.dtype, target_int_dtype
                )
            elif input_array.dtype.kind in "fcb":
                # TODO: de-dup with maybe_cast_to_integer_array?
                arr = input_array[notna(input_array)]
                if (arr.astype(int) == arr).all():
                    # all non-NA values are whole numbers -> nullable Int64
                    inferred_dtype = target_int_dtype
                else:
                    inferred_dtype = input_array.dtype
            elif (
                infer_objects
                and input_array.dtype == object
                and (isinstance(inferred_dtype, str) and inferred_dtype == "integer")
            ):
                inferred_dtype = target_int_dtype

        if convert_floating:
            if input_array.dtype.kind in "fcb":
                # i.e. numeric but not integer
                from pandas.core.arrays.floating import NUMPY_FLOAT_TO_DTYPE

                inferred_float_dtype: DtypeObj = NUMPY_FLOAT_TO_DTYPE.get(
                    input_array.dtype, pandas_dtype_func("Float64")
                )
                # if we could also convert to integer, check if all floats
                # are actually integers
                if convert_integer:
                    # TODO: de-dup with maybe_cast_to_integer_array?
                    arr = input_array[notna(input_array)]
                    if (arr.astype(int) == arr).all():
                        inferred_dtype = pandas_dtype_func("Int64")
                    else:
                        inferred_dtype = inferred_float_dtype
                else:
                    inferred_dtype = inferred_float_dtype
            elif (
                infer_objects
                and input_array.dtype == object
                and (
                    isinstance(inferred_dtype, str)
                    and inferred_dtype == "mixed-integer-float"
                )
            ):
                inferred_dtype = pandas_dtype_func("Float64")

        if convert_boolean:
            if input_array.dtype.kind == "b":
                inferred_dtype = pandas_dtype_func("boolean")
            elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = pandas_dtype_func("boolean")

        if isinstance(inferred_dtype, str):
            # If we couldn't do anything else, then we retain the dtype
            inferred_dtype = input_array.dtype

    else:
        # nothing requested, or the input is already an ExtensionArray
        inferred_dtype = input_array.dtype

    if dtype_backend == "pyarrow":
        from pandas.core.arrays.arrow.array import to_pyarrow_type
        from pandas.core.arrays.string_ import StringDtype

        assert not isinstance(inferred_dtype, str)

        if (
            (convert_integer and inferred_dtype.kind in "iu")
            or (convert_floating and inferred_dtype.kind in "fc")
            or (convert_boolean and inferred_dtype.kind == "b")
            or (convert_string and isinstance(inferred_dtype, StringDtype))
            or (
                inferred_dtype.kind not in "iufcb"
                and not isinstance(inferred_dtype, StringDtype)
            )
        ):
            # find the numpy base dtype from which to derive the pyarrow type
            if isinstance(inferred_dtype, PandasExtensionDtype) and not isinstance(
                inferred_dtype, DatetimeTZDtype
            ):
                base_dtype = inferred_dtype.base
            elif isinstance(inferred_dtype, (BaseMaskedDtype, ArrowDtype)):
                base_dtype = inferred_dtype.numpy_dtype
            elif isinstance(inferred_dtype, StringDtype):
                base_dtype = np.dtype(str)
            else:
                base_dtype = inferred_dtype
            if (
                base_dtype.kind == "O"  # type: ignore[union-attr]
                and input_array.size > 0
                and isna(input_array).all()
            ):
                import pyarrow as pa

                # all-NA object input maps to the pyarrow null type
                pa_type = pa.null()
            else:
                pa_type = to_pyarrow_type(base_dtype)
            if pa_type is not None:
                inferred_dtype = ArrowDtype(pa_type)
    elif dtype_backend == "numpy_nullable" and isinstance(inferred_dtype, ArrowDtype):
        # GH 53648
        inferred_dtype = _arrow_dtype_mapping()[inferred_dtype.pyarrow_dtype]

    # error: Incompatible return value type (got "Union[str, Union[dtype[Any],
    # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]")
    return inferred_dtype  # type: ignore[return-value]
1154
1155
def maybe_infer_to_datetimelike(
    value: npt.NDArray[np.object_],
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
    """
    Try to convert an object-dtype ndarray to a datetimelike array.

    No dtype is passed, and the values are not changed unless we find an
    actual datetime/timedelta-like set.  This is strict: a
    datetime/timedelta entry is REQUIRED in addition to possible
    nulls/string-likes.

    Parameters
    ----------
    value : np.ndarray[object]
        Must be 1-dimensional with object dtype.

    Returns
    -------
    np.ndarray, DatetimeArray, TimedeltaArray, PeriodArray, or IntervalArray
    """
    # Callers are responsible for only passing 1-dim ndarray[object].
    if not isinstance(value, np.ndarray) or value.dtype != object:
        raise TypeError(type(value))  # pragma: no cover
    if value.ndim != 1:
        raise ValueError(value.ndim)  # pragma: no cover

    if len(value) == 0:
        return value

    # error: Incompatible return value type (got "Union[ExtensionArray,
    # ndarray[Any, Any]]", expected "Union[ndarray[Any, Any], DatetimeArray,
    # TimedeltaArray, PeriodArray, IntervalArray]")
    return lib.maybe_convert_objects(  # type: ignore[return-value]
        value,
        # Numeric conversion is deliberately skipped: if numeric were
        # appropriate, numpy would already have produced a numeric dtype.
        convert_numeric=False,
        convert_non_numeric=True,
        dtype_if_all_nat=np.dtype("M8[ns]"),
    )
1197
1198
def maybe_cast_to_datetime(
    value: np.ndarray | list, dtype: np.dtype
) -> ExtensionArray | np.ndarray:
    """
    Try to cast the array/value to a datetimelike dtype, converting float
    nan to iNaT.

    Caller is responsible for handling ExtensionDtype cases and non dt64/td64
    cases.
    """
    from pandas.core.arrays.datetimes import DatetimeArray
    from pandas.core.arrays.timedeltas import TimedeltaArray

    assert dtype.kind in "mM"
    if not is_list_like(value):
        raise TypeError("value must be listlike")

    # TODO: _from_sequence would raise ValueError in cases where
    # _ensure_nanosecond_dtype raises TypeError
    _ensure_nanosecond_dtype(dtype)

    if lib.is_np_dtype(dtype, "m"):
        # timedelta64 path has no special error translation
        return TimedeltaArray._from_sequence(value, dtype=dtype)

    try:
        dta = DatetimeArray._from_sequence(value, dtype=dtype)
    except ValueError as err:
        # Re-raise with a Series-specific message when tz-aware data is
        # combined with a tz-naive dtype.
        if "cannot supply both a tz and a timezone-naive dtype" in str(err):
            raise ValueError(
                "Cannot convert timezone-aware data to "
                "timezone-naive dtype. Use "
                "pd.Series(values).dt.tz_localize(None) instead."
            ) from err
        raise

    return dta
1237
1238
def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
    """
    Convert dtypes with granularity less than nanosecond to nanosecond

    >>> _ensure_nanosecond_dtype(np.dtype("M8[us]"))

    >>> _ensure_nanosecond_dtype(np.dtype("M8[D]"))
    Traceback (most recent call last):
        ...
    TypeError: dtype=datetime64[D] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'

    >>> _ensure_nanosecond_dtype(np.dtype("m8[ps]"))
    Traceback (most recent call last):
        ...
    TypeError: dtype=timedelta64[ps] is not supported. Supported resolutions are 's', 'ms', 'us', and 'ns'
    """  # noqa: E501
    # Build the unit-less message from the original dtype, before unpacking.
    msg = (
        f"The '{dtype.name}' dtype has no unit. "
        f"Please pass in '{dtype.name}[ns]' instead."
    )

    # unpack e.g. SparseDtype
    dtype = getattr(dtype, "subtype", dtype)

    if not isinstance(dtype, np.dtype):
        # i.e. datetime64tz -- nothing to validate
        return
    if dtype.kind not in "mM":
        return
    if is_supported_dtype(dtype):
        return

    # pre-2.0 we would silently swap in nanos for lower-resolutions,
    # raise for above-nano resolutions
    if dtype.name in ["datetime64", "timedelta64"]:
        # a unit-less dt64/td64 dtype
        raise ValueError(msg)
    # TODO: ValueError or TypeError? existing test
    # test_constructor_generic_timestamp_bad_frequency expects TypeError
    raise TypeError(
        f"dtype={dtype} is not supported. Supported resolutions are 's', "
        "'ms', 'us', and 'ns'"
    )
1279
1280
1281# TODO: other value-dependent functions to standardize here include
1282# Index._find_common_type_compat
def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj:
    """
    Find the type/dtype for the result of an operation between objects.

    This is similar to find_common_type, but looks at the right object instead
    of just its dtype. This can be useful in particular when the right
    object does not have a `dtype`.

    Parameters
    ----------
    left_dtype : np.dtype or ExtensionDtype
    right : Any

    Returns
    -------
    np.dtype or ExtensionDtype

    See also
    --------
    find_common_type
    numpy.result_type
    """
    new_dtype: DtypeObj

    if (
        isinstance(left_dtype, np.dtype)
        and left_dtype.kind in "iuc"
        and (lib.is_integer(right) or lib.is_float(right))
    ):
        # e.g. with int8 dtype and right=512, we want to end up with
        # np.int16, whereas infer_dtype_from(512) gives np.int64,
        # which will make us upcast too far.
        if lib.is_float(right) and right.is_integer() and left_dtype.kind != "f":
            # Integer-valued float: treat it as an int so the result can
            # stay integer.  NOTE(review): `left_dtype.kind != "f"` is
            # always True here given the `kind in "iuc"` check above.
            right = int(right)
        # After NEP 50, numpy won't inspect Python scalars
        # TODO: do we need to recreate numpy's inspection logic for floats too
        # (this breaks some tests)
        if isinstance(right, int) and not isinstance(right, np.integer):
            # This gives an unsigned type by default
            # (if our number is positive)

            # If our left dtype is signed, we might not want this since
            # this might give us 1 dtype too big
            # We should check if the corresponding int dtype (e.g. int64 for uint64)
            # can hold the number
            right_dtype = np.min_scalar_type(right)
            # NOTE(review): for ints outside the uint64 range,
            # np.min_scalar_type may return a non-integer dtype, in which
            # case np.iinfo below would raise -- confirm callers never pass
            # such values.
            if right == 0:
                # Special case 0
                right = left_dtype
            elif (
                not np.issubdtype(left_dtype, np.unsignedinteger)
                and 0 < right <= np.iinfo(right_dtype).max
            ):
                # If left dtype isn't unsigned, check if it fits in the signed dtype
                right = np.dtype(f"i{right_dtype.itemsize}")
            else:
                right = right_dtype

        new_dtype = np.result_type(left_dtype, right)

    elif is_valid_na_for_dtype(right, left_dtype):
        # e.g. IntervalDtype[int] and None/np.nan
        new_dtype = ensure_dtype_can_hold_na(left_dtype)

    else:
        # General case: infer a dtype from the value, then find a common one.
        dtype, _ = infer_dtype_from(right)
        new_dtype = find_common_type([left_dtype, dtype])

    return new_dtype
1352
1353
def common_dtype_categorical_compat(
    objs: Sequence[Index | ArrayLike], dtype: DtypeObj
) -> DtypeObj:
    """
    Update the result of find_common_type to account for NAs in a Categorical.

    Parameters
    ----------
    objs : list[np.ndarray | ExtensionArray | Index]
    dtype : np.dtype or ExtensionDtype

    Returns
    -------
    np.dtype or ExtensionDtype
    """
    # GH#38240
    # TODO: more generally, could do `not can_hold_na(dtype)`
    if not lib.is_np_dtype(dtype, "iu"):
        return dtype

    for obj in objs:
        # Check the dtype attribute so e.g. the string "categorical" is not
        # accidentally treated as categorical here.
        obj_dtype = getattr(obj, "dtype", None)
        if not isinstance(obj_dtype, CategoricalDtype):
            continue

        if isinstance(obj, ABCIndex):
            # Index.hasnans may already be cached
            hasnas = obj.hasnans
        else:
            # Categorical
            hasnas = cast("Categorical", obj)._hasna

        if hasnas:
            # see test_union_int_categorical_with_nan
            return np.dtype(np.float64)

    return dtype
1389
1390
def np_find_common_type(*dtypes: np.dtype) -> np.dtype:
    """
    np.find_common_type implementation pre-1.25 deprecation using np.result_type
    https://github.com/pandas-dev/pandas/pull/49569#issuecomment-1308300065

    Parameters
    ----------
    dtypes : np.dtypes

    Returns
    -------
    np.dtype
    """
    try:
        common_dtype = np.result_type(*dtypes)
    except TypeError:
        # result_type refuses some combinations outright; fall back to object
        return np.dtype("O")

    if common_dtype.kind in "mMSU":
        # NumPy promotion currently (1.25) misbehaves for times and strings,
        # so fall back to object (find_common_type did unless there
        # was only one dtype)
        return np.dtype("O")

    return common_dtype
1415
1416
@overload
def find_common_type(types: list[np.dtype]) -> np.dtype:
    ...


@overload
def find_common_type(types: list[ExtensionDtype]) -> DtypeObj:
    ...


@overload
def find_common_type(types: list[DtypeObj]) -> DtypeObj:
    ...


def find_common_type(types):
    """
    Find a common data type among the given dtypes.

    Parameters
    ----------
    types : list of dtypes

    Returns
    -------
    pandas extension or numpy dtype

    See Also
    --------
    numpy.find_common_type

    """
    if not types:
        raise ValueError("no types given")

    # Fast path when every dtype is identical; this also works around
    # find_common_type([np.dtype('datetime64[ns]')] * 2) => object
    if lib.dtypes_all_equal(list(types)):
        return types[0]

    # de-duplicate, preserving order (dict.fromkeys acts as an ordered set)
    types = list(dict.fromkeys(types).keys())

    ea_dtypes = [t for t in types if isinstance(t, ExtensionDtype)]
    if ea_dtypes:
        # Give each ExtensionDtype, in order, a chance to propose a common
        # dtype; fall back to object if none does.
        for t in ea_dtypes:
            res = t._get_common_dtype(types)
            if res is not None:
                return res
        return np.dtype("object")

    # all-datetime or all-timedelta: take lowest unit
    for kind in "Mm":
        if all(lib.is_np_dtype(t, kind) for t in types):
            return np.dtype(max(types))

    # don't mix bool / int or float or complex
    # this is different from numpy, which casts bool with float/int as int
    if any(t.kind == "b" for t in types) and any(t.kind in "iufc" for t in types):
        return np.dtype("object")

    return np_find_common_type(*types)
1485
1486
def construct_2d_arraylike_from_scalar(
    value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool
) -> np.ndarray:
    """
    Build a (length, width) ndarray of ``dtype`` filled with ``value``.

    Parameters
    ----------
    value : Scalar
    length, width : int
        Shape of the result.
    dtype : np.dtype
    copy : bool
        Whether to copy when coercing ``value`` to an ndarray.

    Returns
    -------
    np.ndarray

    Raises
    ------
    TypeError
        If ``value`` cannot be coerced to ``dtype``.
    ValueError
        If the coerced ``value`` is not 0-dimensional (i.e. not scalar-like).
    """
    shape = (length, width)

    if dtype.kind in "mM":
        value = _maybe_box_and_unbox_datetimelike(value, dtype)
    elif dtype == _dtype_obj and isinstance(value, (np.timedelta64, np.datetime64)):
        # calling np.array below would cast to pytimedelta/pydatetime
        out = np.empty(shape, dtype=object)
        out.fill(value)
        return out

    # Attempt to coerce to a numpy array
    try:
        if copy:
            arr = np.array(value, dtype=dtype, copy=copy)
        else:
            arr = np.asarray(value, dtype=dtype)
    except (ValueError, TypeError) as err:
        raise TypeError(
            f"DataFrame constructor called with incompatible data and dtype: {err}"
        ) from err

    if arr.ndim != 0:
        raise ValueError("DataFrame constructor not properly called!")

    return np.full(shape, arr)
1516
1517
def construct_1d_arraylike_from_scalar(
    value: Scalar, length: int, dtype: DtypeObj | None
) -> ArrayLike:
    """
    create a np.ndarray / pandas type of specified shape and dtype
    filled with values

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype or np.dtype

    Returns
    -------
    np.ndarray / pandas type of length, filled with value

    """

    if dtype is None:
        try:
            dtype, value = infer_dtype_from_scalar(value)
        except OutOfBoundsDatetime:
            # out-of-bounds datetimes are retained as objects
            dtype = _dtype_obj

    if isinstance(dtype, ExtensionDtype):
        cls = dtype.construct_array_type()
        seq = [value] if length else []
        return cls._from_sequence(seq, dtype=dtype).repeat(length)

    if length and dtype.kind in "iu" and isna(value):
        # coerce if we have nan for an integer dtype
        dtype = np.dtype("float64")
    elif lib.is_np_dtype(dtype, "US"):
        # coerce to object dtype so numpy treats our string as a
        # scalar value rather than a sequence of characters
        dtype = np.dtype("object")
        if not isna(value):
            value = ensure_str(value)
    elif dtype.kind in "mM":
        value = _maybe_box_and_unbox_datetimelike(value, dtype)

    subarr = np.empty(length, dtype=dtype)
    if length:
        # GH 47391: numpy > 1.24 will raise filling np.nan into int dtypes
        subarr.fill(value)

    return subarr
1567
1568
def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
    # Caller is responsible for checking dtype.kind in "mM"

    # Box only stdlib datetimes; np.datetime64 scalars -- in particular
    # datetime64("NaT") -- must stay unboxed.
    boxed = (
        maybe_box_datetimelike(value, dtype)
        if isinstance(value, dt.datetime)
        else value
    )
    return _maybe_unbox_datetimelike(boxed, dtype)
1577
1578
1579def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
1580 """
1581 Transform any list-like object in a 1-dimensional numpy array of object
1582 dtype.
1583
1584 Parameters
1585 ----------
1586 values : any iterable which has a len()
1587
1588 Raises
1589 ------
1590 TypeError
1591 * If `values` does not have a len()
1592
1593 Returns
1594 -------
1595 1-dimensional numpy array of dtype object
1596 """
1597 # numpy will try to interpret nested lists as further dimensions, hence
1598 # making a 1D array that contains list-likes is a bit tricky:
1599 result = np.empty(len(values), dtype="object")
1600 result[:] = values
1601 return result
1602
1603
def maybe_cast_to_integer_array(arr: list | np.ndarray, dtype: np.dtype) -> np.ndarray:
    """
    Takes any dtype and returns the casted version, raising for when data is
    incompatible with integer/unsigned integer dtypes.

    Parameters
    ----------
    arr : np.ndarray or list
        The array to cast.
    dtype : np.dtype
        The integer dtype to cast the array to.

    Returns
    -------
    ndarray
        Array of integer or unsigned integer dtype.

    Raises
    ------
    OverflowError : the dtype is incompatible with the data
    ValueError : loss of precision has occurred during casting

    Examples
    --------
    If you try to coerce negative values to unsigned integers, it raises:

    >>> pd.Series([-1], dtype="uint64")
    Traceback (most recent call last):
        ...
    OverflowError: Trying to coerce negative values to unsigned integers

    Also, if you try to coerce float values to integers, it raises:

    >>> maybe_cast_to_integer_array([1, 2, 3.5], dtype=np.dtype("int64"))
    Traceback (most recent call last):
        ...
    ValueError: Trying to coerce float values to integers
    """
    assert dtype.kind in "iu"

    # First perform the cast; whether it was lossless is validated below.
    try:
        if not isinstance(arr, np.ndarray):
            with warnings.catch_warnings():
                # We already disallow dtype=uint w/ negative numbers
                # (test_constructor_coercion_signed_to_unsigned) so safe to ignore.
                warnings.filterwarnings(
                    "ignore",
                    "NumPy will stop allowing conversion of out-of-bound Python int",
                    DeprecationWarning,
                )
                casted = np.asarray(arr, dtype=dtype)
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                casted = arr.astype(dtype, copy=False)
    except OverflowError as err:
        raise OverflowError(
            "The elements provided in the data cannot all be "
            f"casted to the dtype {dtype}"
        ) from err

    if isinstance(arr, np.ndarray) and arr.dtype == dtype:
        # avoid expensive array_equal check
        return casted

    # Compare the original values to the casted ones; equality means the
    # cast round-tripped losslessly.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        warnings.filterwarnings(
            "ignore", "elementwise comparison failed", FutureWarning
        )
        if np.array_equal(arr, casted):
            return casted

    # The cast was lossy; everything below diagnoses why, so we can raise
    # the most informative error.

    # We do this casting to allow for proper
    # data and dtype checking.
    #
    # We didn't do this earlier because NumPy
    # doesn't handle `uint64` correctly.
    arr = np.asarray(arr)

    if np.issubdtype(arr.dtype, str):
        # TODO(numpy-2.0 min): This case will raise an OverflowError above
        if (casted.astype(str) == arr).all():
            return casted
        raise ValueError(f"string values cannot be losslessly cast to {dtype}")

    if dtype.kind == "u" and (arr < 0).any():
        # TODO: can this be hit anymore after numpy 2.0?
        raise OverflowError("Trying to coerce negative values to unsigned integers")

    if arr.dtype.kind == "f":
        if not np.isfinite(arr).all():
            raise IntCastingNaNError(
                "Cannot convert non-finite values (NA or inf) to integer"
            )
        raise ValueError("Trying to coerce float values to integers")
    if arr.dtype == object:
        # NOTE(review): object-dtype input reuses the float-specific message.
        raise ValueError("Trying to coerce float values to integers")

    if casted.dtype < arr.dtype:
        # TODO: Can this path be hit anymore with numpy > 2
        # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows
        raise ValueError(
            f"Values are too large to be losslessly converted to {dtype}. "
            f"To cast anyway, use pd.Series(values).astype({dtype})"
        )

    if arr.dtype.kind in "mM":
        # test_constructor_maskedarray_nonfloat
        raise TypeError(
            f"Constructing a Series or DataFrame from {arr.dtype} values and "
            f"dtype={dtype} is not supported. Use values.view({dtype}) instead."
        )

    # No known cases that get here, but raising explicitly to cover our bases.
    raise ValueError(f"values cannot be losslessly cast to {dtype}")
1720
1721
def can_hold_element(arr: ArrayLike, element: Any) -> bool:
    """
    Can we do an inplace setitem with this element in an array with this dtype?

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray
    element : Any

    Returns
    -------
    bool
    """
    dtype = arr.dtype

    if isinstance(dtype, np.dtype) and dtype.kind not in "mM":
        # Plain (non-datetimelike) numpy case: delegate to the dtype-level
        # lossless-setitem check.
        try:
            np_can_hold_element(dtype, element)
        except (TypeError, LossySetitemError):
            return False
        return True

    if isinstance(dtype, (PeriodDtype, IntervalDtype, DatetimeTZDtype, np.dtype)):
        # np.dtype here catches datetime64ns and timedelta64ns; we assume
        # in this case that we have DatetimeArray/TimedeltaArray
        arr = cast(
            "PeriodArray | DatetimeArray | TimedeltaArray | IntervalArray", arr
        )
        try:
            arr._validate_setitem_value(element)
        except (ValueError, TypeError):
            return False
        return True

    # This is technically incorrect, but maintains the behavior of
    # ExtensionBlock._can_hold_element
    return True
1758
1759
def np_can_hold_element(dtype: np.dtype, element: Any) -> Any:
    """
    Raise if we cannot losslessly set this element into an ndarray with this dtype.

    Specifically about places where we disagree with numpy. i.e. there are
    cases where numpy will raise in doing the setitem that we do not check
    for here, e.g. setting str "X" into a numeric ndarray.

    Parameters
    ----------
    dtype : np.dtype
        Dtype of the ndarray we want to set `element` into.
    element : Any
        Scalar or array-like value being set.

    Returns
    -------
    Any
        The element, potentially cast to the dtype.

    Raises
    ------
    LossySetitemError
        If we cannot losslessly store this element with this dtype.
    NotImplementedError
        If `dtype` is of a kind not handled here.
    """
    if dtype == _dtype_obj:
        # object dtype can hold anything as-is
        return element

    # dtype inferred from `element` when it carries one (e.g. ndarray,
    # nullable array, numpy scalar); presumably None for plain Python
    # scalars -- see _maybe_infer_dtype_type. TODO confirm
    tipo = _maybe_infer_dtype_type(element)

    if dtype.kind in "iu":
        if isinstance(element, range):
            # A range is held losslessly iff both endpoints fit the dtype.
            if _dtype_can_hold_range(element, dtype):
                return element
            raise LossySetitemError

        if is_integer(element) or (is_float(element) and element.is_integer()):
            # e.g. test_setitem_series_int8 if we have a python int 1
            # tipo may be np.int32, despite the fact that it will fit
            # in smaller int dtypes.
            info = np.iinfo(dtype)
            if info.min <= element <= info.max:
                return dtype.type(element)
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind not in "iu":
                if isinstance(element, np.ndarray) and element.dtype.kind == "f":
                    # If all can be losslessly cast to integers, then we can hold them
                    with np.errstate(invalid="ignore"):
                        # We check afterwards if cast was losslessly, so no need to show
                        # the warning
                        casted = element.astype(dtype)
                    comp = casted == element
                    if comp.all():
                        # Return the casted values bc they can be passed to
                        # np.putmask, whereas the raw values cannot.
                        # see TestSetitemFloatNDarrayIntoIntegerSeries
                        return casted
                    raise LossySetitemError

                elif isinstance(element, ABCExtensionArray) and isinstance(
                    element.dtype, CategoricalDtype
                ):
                    # GH#52927 setting Categorical value into non-EA frame
                    # TODO: general-case for EAs?
                    try:
                        casted = element.astype(dtype)
                    except (ValueError, TypeError):
                        raise LossySetitemError
                    # Check for cases of either
                    # a) lossy overflow/rounding or
                    # b) semantic changes like dt64->int64
                    comp = casted == element
                    if not comp.all():
                        raise LossySetitemError
                    return casted

                # Anything other than integer we cannot hold
                raise LossySetitemError
            if (
                dtype.kind == "u"
                and isinstance(element, np.ndarray)
                and element.dtype.kind == "i"
            ):
                # see test_where_uint64
                casted = element.astype(dtype)
                if (casted == element).all():
                    # TODO: faster to check (element >=0).all()? potential
                    # itemsize issues there?
                    return casted
                raise LossySetitemError
            if dtype.itemsize < tipo.itemsize:
                # e.g. int32 values are not guaranteed to fit in int16
                raise LossySetitemError
            if not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype; we can put this into an ndarray
                # losslessly iff it has no NAs
                arr = element._values if isinstance(element, ABCSeries) else element
                if arr._hasna:
                    raise LossySetitemError
                return element

            return element

        raise LossySetitemError

    if dtype.kind == "f":
        if lib.is_integer(element) or lib.is_float(element):
            casted = dtype.type(element)
            if np.isnan(casted) or casted == element:
                return casted
            # otherwise e.g. overflow see TestCoercionFloat32
            raise LossySetitemError

        if tipo is not None:
            # TODO: itemsize check?
            if tipo.kind not in "iuf":
                # Anything other than float/integer we cannot hold
                raise LossySetitemError
            if not isinstance(tipo, np.dtype):
                # i.e. nullable IntegerDtype or FloatingDtype;
                # we can put this into an ndarray losslessly iff it has no NAs
                if element._hasna:
                    raise LossySetitemError
                return element
            elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind:
                if isinstance(element, np.ndarray):
                    # e.g. TestDataFrameIndexingWhere::test_where_alignment
                    casted = element.astype(dtype)
                    # equal_nan so NaN positions compare as equal
                    if np.array_equal(casted, element, equal_nan=True):
                        return casted
                    raise LossySetitemError

            return element

        raise LossySetitemError

    if dtype.kind == "c":
        if lib.is_integer(element) or lib.is_complex(element) or lib.is_float(element):
            if np.isnan(element):
                # see test_where_complex GH#6345
                return dtype.type(element)

            with warnings.catch_warnings():
                # suppress e.g. ComplexWarning on the narrowing cast
                warnings.filterwarnings("ignore")
                casted = dtype.type(element)
            if casted == element:
                return casted
            # otherwise e.g. overflow see test_32878_complex_itemsize
            raise LossySetitemError

        if tipo is not None:
            if tipo.kind in "iufc":
                return element
            raise LossySetitemError
        raise LossySetitemError

    if dtype.kind == "b":
        if tipo is not None:
            if tipo.kind == "b":
                if not isinstance(tipo, np.dtype):
                    # i.e. we have a BooleanArray
                    if element._hasna:
                        # i.e. there are pd.NA elements
                        raise LossySetitemError
                return element
            raise LossySetitemError
        if lib.is_bool(element):
            return element
        raise LossySetitemError

    if dtype.kind == "S":
        # TODO: test tests.frame.methods.test_replace tests get here,
        # need more targeted tests. xref phofl has a PR about this
        if tipo is not None:
            if tipo.kind == "S" and tipo.itemsize <= dtype.itemsize:
                return element
            raise LossySetitemError
        if isinstance(element, bytes) and len(element) <= dtype.itemsize:
            return element
        raise LossySetitemError

    if dtype.kind == "V":
        # i.e. np.void, which cannot hold _anything_
        raise LossySetitemError

    raise NotImplementedError(dtype)
1939
1940
def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
    """
    _maybe_infer_dtype_type infers to int64 (and float64 for very large endpoints),
    but in many cases a range can be held by a smaller integer dtype.
    Check if this is one of those cases.
    """
    # An empty range fits any integer dtype; otherwise both endpoints must
    # be representable.
    return not rng or (
        np_can_cast_scalar(rng.start, dtype) and np_can_cast_scalar(rng.stop, dtype)
    )
1950
1951
def np_can_cast_scalar(element: Scalar, dtype: np.dtype) -> bool:
    """
    np.can_cast pandas-equivalent for pre 2-0 behavior that allowed scalar
    inference

    Parameters
    ----------
    element : Scalar
    dtype : np.dtype

    Returns
    -------
    bool
    """
    try:
        # np_can_hold_element raises if the element cannot be held losslessly
        np_can_hold_element(dtype, element)
    except (LossySetitemError, NotImplementedError):
        return False
    return True