1"""
2missing types & inference
3"""
4from __future__ import annotations
5
6from decimal import Decimal
7from functools import partial
8from typing import (
9 TYPE_CHECKING,
10 overload,
11)
12
13import numpy as np
14
15from pandas._config import get_option
16
17from pandas._libs import lib
18import pandas._libs.missing as libmissing
19from pandas._libs.tslibs import (
20 NaT,
21 iNaT,
22)
23
24from pandas.core.dtypes.common import (
25 DT64NS_DTYPE,
26 TD64NS_DTYPE,
27 ensure_object,
28 is_bool_dtype,
29 is_categorical_dtype,
30 is_complex_dtype,
31 is_dtype_equal,
32 is_extension_array_dtype,
33 is_float_dtype,
34 is_integer_dtype,
35 is_object_dtype,
36 is_scalar,
37 is_string_or_object_np_dtype,
38 needs_i8_conversion,
39)
40from pandas.core.dtypes.dtypes import (
41 CategoricalDtype,
42 DatetimeTZDtype,
43 ExtensionDtype,
44 IntervalDtype,
45 PeriodDtype,
46)
47from pandas.core.dtypes.generic import (
48 ABCDataFrame,
49 ABCExtensionArray,
50 ABCIndex,
51 ABCMultiIndex,
52 ABCSeries,
53)
54from pandas.core.dtypes.inference import is_list_like
55
56if TYPE_CHECKING:
57 from pandas._typing import (
58 ArrayLike,
59 DtypeObj,
60 NDFrame,
61 NDFrameT,
62 Scalar,
63 npt,
64 )
65
66 from pandas.core.indexes.base import Index
67
68
# Scalar +/- infinity checks re-exported from the compiled missing-value module.
isposinf_scalar = libmissing.isposinf_scalar
isneginf_scalar = libmissing.isneginf_scalar

# Module-level state rebound by ``_use_inf_as_na`` whenever the option changes;
# ``isna_all`` reads both names at call time.
nan_checker = np.isnan
INF_AS_NA = False
# Pre-built numpy dtypes that ``is_valid_na_for_dtype`` compares against.
_dtype_object = np.dtype("object")
_dtype_str = np.dtype(str)
76
77
@overload
def isna(obj: Scalar) -> bool:
    ...


@overload
def isna(
    obj: ArrayLike | Index | list,
) -> npt.NDArray[np.bool_]:
    ...


@overload
def isna(obj: NDFrameT) -> NDFrameT:
    ...


# handle unions
@overload
def isna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
    ...


@overload
def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
    ...


def isna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
    """
    Detect missing values for an array-like object.

    This function takes a scalar or array-like object and indicates
    whether values are missing (``NaN`` in numeric arrays, ``None`` or ``NaN``
    in object arrays, ``NaT`` in datetimelike).

    Parameters
    ----------
    obj : scalar or array-like
        Object to check for null or missing values.

    Returns
    -------
    bool or array-like of bool
        For scalar input, returns a scalar boolean.
        For array input, returns an array of boolean indicating whether each
        corresponding element is missing.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.

    See Also
    --------
    notna : Boolean inverse of pandas.isna.
    Series.isna : Detect missing values in a Series.
    DataFrame.isna : Detect missing values in a DataFrame.
    Index.isna : Detect missing values in an Index.

    Notes
    -----
    All work is delegated to the module-level ``_isna``, which is looked up
    at call time. The ``_use_inf_as_na`` option callback rebinds ``_isna``
    with a fixed ``inf_as_na`` value, so infinities are reported as missing
    only after that callback has enabled it.

    Examples
    --------
    Scalar arguments (including strings) result in a scalar boolean.

    >>> pd.isna('dog')
    False

    >>> pd.isna(pd.NA)
    True

    >>> pd.isna(np.nan)
    True

    ndarrays result in an ndarray of booleans.

    >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
    >>> array
    array([[ 1., nan,  3.],
           [ 4.,  5., nan]])
    >>> pd.isna(array)
    array([[False,  True, False],
           [False, False,  True]])

    For indexes, an ndarray of booleans is returned.

    >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
    ...                           "2017-07-08"])
    >>> index
    DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
                  dtype='datetime64[ns]', freq=None)
    >>> pd.isna(index)
    array([False, False,  True, False])

    For Series and DataFrame, the same type is returned, containing booleans.

    >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
    >>> df
         0     1    2
    0  ant   bee  cat
    1  dog  None  fly
    >>> pd.isna(df)
           0      1      2
    0  False  False  False
    1  False   True  False

    >>> pd.isna(df[1])
    0    False
    1     True
    Name: 1, dtype: bool
    """
    return _isna(obj)
184
185
# Alias: ``isnull`` is bound to the very same function object as ``isna``.
isnull = isna
187
188
def _isna(obj, inf_as_na: bool = False):
    """
    Dispatch a missing-value check according to the kind of ``obj``.

    None, NaN and NA count as null; infinities additionally count as null
    when ``inf_as_na`` is True.

    Parameters
    ----------
    obj : ndarray or object value
        Input array or scalar value.
    inf_as_na : bool
        Whether to treat infinity as null.

    Returns
    -------
    boolean ndarray or boolean
        Objects that are neither scalars nor array-like (including classes)
        yield ``False``.

    Raises
    ------
    NotImplementedError
        If ``obj`` is a MultiIndex.
    """
    if is_scalar(obj):
        return libmissing.checknull(obj, inf_as_na=inf_as_na)

    if isinstance(obj, ABCMultiIndex):
        raise NotImplementedError("isna is not defined for MultiIndex")

    if isinstance(obj, type):
        return False

    if isinstance(obj, (np.ndarray, ABCExtensionArray)):
        return _isna_array(obj, inf_as_na=inf_as_na)

    if isinstance(obj, ABCIndex):
        if obj._can_hold_na:
            return _isna_array(obj._values, inf_as_na=inf_as_na)
        # Try to use cached isna, which also short-circuits for integer dtypes
        # and avoids materializing RangeIndex._values
        return obj.isna()

    if isinstance(obj, ABCSeries):
        mask = _isna_array(obj._values, inf_as_na=inf_as_na)
        # box the raw mask back into the caller's Series type
        return obj._constructor(mask, index=obj.index, name=obj.name, copy=False)

    if isinstance(obj, ABCDataFrame):
        return obj.isna()

    if isinstance(obj, list):
        as_objects = np.asarray(obj, dtype=object)
        return _isna_array(as_objects, inf_as_na=inf_as_na)

    if hasattr(obj, "__array__"):
        return _isna_array(np.asarray(obj), inf_as_na=inf_as_na)

    return False
233
234
def _use_inf_as_na(key) -> None:
    """
    Option change callback for na/inf behaviour.

    Reads the current option value and rebinds the module-level ``_isna``,
    ``nan_checker`` and ``INF_AS_NA`` names to match it.

    Parameters
    ----------
    key : str
        Name of the option to read with ``get_option``. A truthy value means
        None, NaN, INF and -INF are all treated as null (old way); a falsy
        value means None and NaN are null but INF and -INF are not (new way).

    Notes
    -----
    This approach to setting global module values is discussed and
    approved here:

    * https://stackoverflow.com/questions/4859217/
      programmatically-creating-variables-in-python/4859312#4859312
    """
    treat_inf_as_na = get_option(key)
    module_ns = globals()

    module_ns["_isna"] = partial(_isna, inf_as_na=treat_inf_as_na)
    if treat_inf_as_na:
        module_ns["nan_checker"] = lambda x: ~np.isfinite(x)
        module_ns["INF_AS_NA"] = True
    else:
        module_ns["nan_checker"] = np.isnan
        module_ns["INF_AS_NA"] = False
264
265
def _isna_array(values: ArrayLike, inf_as_na: bool = False):
    """
    Build a boolean mask marking the NaN / NA entries of ``values``.

    Parameters
    ----------
    values : ndarray or ExtensionArray
        The input array whose elements are to be checked.
    inf_as_na : bool
        Whether or not to treat infinite values as NA.

    Returns
    -------
    array-like
        Array of boolean values denoting the NA status of each element.
    """
    dtype = values.dtype

    if not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        if inf_as_na and is_categorical_dtype(dtype):
            return libmissing.isnaobj(values.to_numpy(), inf_as_na=inf_as_na)
        return values.isna()

    if is_string_or_object_np_dtype(dtype):
        return _isna_string_dtype(values, inf_as_na=inf_as_na)

    if needs_i8_conversion(dtype):
        # this is the NaT pattern
        return values.view("i8") == iNaT

    if inf_as_na:
        return ~np.isfinite(values)
    return np.isnan(values)
305
306
def _isna_string_dtype(values: np.ndarray, inf_as_na: bool) -> npt.NDArray[np.bool_]:
    """
    Return the NA mask for a numpy string ("S"/"U") or object array.

    Fixed-width string arrays get an all-False mask; everything else is
    checked element-wise by ``libmissing.isnaobj``.
    """
    # Working around NumPy ticket 1542
    if values.dtype.kind in ("S", "U"):
        return np.zeros(values.shape, dtype=bool)

    if values.ndim in {1, 2}:
        return libmissing.isnaobj(values, inf_as_na=inf_as_na)

    # Any other dimensionality (0-D is reached via e.g. mask_missing):
    # check a flattened copy/view, then restore the original shape.
    flat_mask = libmissing.isnaobj(values.ravel(), inf_as_na=inf_as_na)
    return flat_mask.reshape(values.shape)
322
323
@overload
def notna(obj: Scalar) -> bool:
    ...


@overload
def notna(
    obj: ArrayLike | Index | list,
) -> npt.NDArray[np.bool_]:
    ...


@overload
def notna(obj: NDFrameT) -> NDFrameT:
    ...


# handle unions
@overload
def notna(obj: NDFrameT | ArrayLike | Index | list) -> NDFrameT | npt.NDArray[np.bool_]:
    ...


@overload
def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
    ...


def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame:
    """
    Detect non-missing values for an array-like object.

    Given a scalar or array-like object, report which values are valid,
    i.e. not missing (missing being ``NaN`` in numeric arrays, ``None`` or
    ``NaN`` in object arrays, ``NaT`` in datetimelike).

    Parameters
    ----------
    obj : array-like or object value
        Object to check for *not* null or *non*-missing values.

    Returns
    -------
    bool or array-like of bool
        For scalar input, returns a scalar boolean.
        For array input, returns an array of boolean indicating whether each
        corresponding element is valid.

    See Also
    --------
    isna : Boolean inverse of pandas.notna.
    Series.notna : Detect valid values in a Series.
    DataFrame.notna : Detect valid values in a DataFrame.
    Index.notna : Detect valid values in an Index.

    Examples
    --------
    Scalar arguments (including strings) result in a scalar boolean.

    >>> pd.notna('dog')
    True

    >>> pd.notna(pd.NA)
    False

    >>> pd.notna(np.nan)
    False

    ndarrays result in an ndarray of booleans.

    >>> array = np.array([[1, np.nan, 3], [4, 5, np.nan]])
    >>> array
    array([[ 1., nan,  3.],
           [ 4.,  5., nan]])
    >>> pd.notna(array)
    array([[ True, False,  True],
           [ True,  True, False]])

    For indexes, an ndarray of booleans is returned.

    >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None,
    ...                          "2017-07-08"])
    >>> index
    DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'],
                  dtype='datetime64[ns]', freq=None)
    >>> pd.notna(index)
    array([ True,  True, False,  True])

    For Series and DataFrame, the same type is returned, containing booleans.

    >>> df = pd.DataFrame([['ant', 'bee', 'cat'], ['dog', None, 'fly']])
    >>> df
         0     1    2
    0  ant   bee  cat
    1  dog  None  fly
    >>> pd.notna(df)
          0      1     2
    0  True   True  True
    1  True  False  True

    >>> pd.notna(df[1])
    0     True
    1    False
    Name: 1, dtype: bool
    """
    missing = isna(obj)
    # A plain Python bool must be negated with ``not``; ``~`` is reserved for
    # the array-like / NDFrame results, where it inverts element-wise.
    return (not missing) if isinstance(missing, bool) else ~missing
433
434
# Alias: ``notnull`` is bound to the very same function object as ``notna``.
notnull = notna
436
437
def isna_compat(arr, fill_value=np.nan) -> bool:
    """
    Check whether ``arr`` can be filled with ``fill_value``.

    Parameters
    ----------
    arr : a numpy array
    fill_value : fill value, default to np.nan

    Returns
    -------
    bool
        True if we can fill using this fill_value. A missing fill value is
        rejected for boolean and integer dtypes; any non-missing fill value
        is accepted.
    """
    if not isna(fill_value):
        return True
    arr_dtype = arr.dtype
    return not is_bool_dtype(arr_dtype) and not is_integer_dtype(arr_dtype)
453
454
def array_equivalent(
    left,
    right,
    strict_nan: bool = False,
    dtype_equal: bool = False,
) -> bool:
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations.  False otherwise. It is assumed that left and
    right are NumPy arrays of the same dtype. The behavior of this function
    (particularly with respect to NaNs) is not defined if the dtypes are
    different.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.
    dtype_equal : bool, default False
        Whether `left` and `right` are known to have the same dtype
        according to `is_dtype_equal`. Some methods like `BlockManager.equals`
        require that the dtypes match. Setting this to ``True`` can improve
        performance, but will give different results for arrays that are
        equal but different dtypes.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    if dtype_equal:
        # fastpath when we require that the dtypes match (Block.equals)
        if left.dtype.kind in ["f", "c"]:
            return _array_equivalent_float(left, right)
        elif needs_i8_conversion(left.dtype):
            return _array_equivalent_datetimelike(left, right)
        elif is_string_or_object_np_dtype(left.dtype):
            # TODO: fastpath for pandas' StringDtype
            return _array_equivalent_object(left, right, strict_nan)
        else:
            return np.array_equal(left, right)

    # Slow path when we allow comparing different dtypes.
    # Object arrays can contain None, NaN and NaT.
    # string dtypes must come to this path for NumPy 1.7.1 compat
    if left.dtype.kind in "OSU" or right.dtype.kind in "OSU":
        # Note: `in "OSU"` is non-trivially faster than `in ["O", "S", "U"]`
        #  or `in ("O", "S", "U")`
        return _array_equivalent_object(left, right, strict_nan)

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
        # Shapes already match, so an empty left means an empty right too.
        if not (left.size and right.size):
            return True
        # ``.all()`` yields ``np.bool_``; convert so the declared ``bool``
        # return type holds, matching ``_array_equivalent_float``.
        return bool(((left == right) | (isna(left) & isna(right))).all())

    elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        # Compare the underlying integers so NaT positions compare equal.
        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    if (
        left.dtype.type is np.void or right.dtype.type is np.void
    ) and left.dtype != right.dtype:
        return False

    return np.array_equal(left, right)
543
544
def _array_equivalent_float(left, right) -> bool:
    """
    Return True when every position holds equal values or NaN on both sides.
    """
    both_nan = np.isnan(left) & np.isnan(right)
    matches = (left == right) | both_nan
    return bool(matches.all())
547
548
def _array_equivalent_datetimelike(left, right):
    """
    Compare two datetime-like arrays through their int64 views.
    """
    left_i8 = left.view("i8")
    right_i8 = right.view("i8")
    return np.array_equal(left_i8, right_i8)
551
552
def _array_equivalent_object(left: np.ndarray, right: np.ndarray, strict_nan: bool):
    """
    Compare two object-like arrays for equivalence.

    Parameters
    ----------
    left, right : np.ndarray
        Arrays to compare; they are walked pairwise with ``zip``.
    strict_nan : bool
        If False, the comparison is delegated to
        ``lib.array_equivalent_object``. If True, each kind of missing value
        (``NaT``, ``NA``, float ``NaN``) only matches the same kind at the
        same position.

    Returns
    -------
    bool
    """
    if not strict_nan:
        # isna considers NaN and None to be equivalent.

        return lib.array_equivalent_object(ensure_object(left), ensure_object(right))

    for left_value, right_value in zip(left, right):
        if left_value is NaT:
            # ``NaT != NaT`` is True, so a matching pair has to be accepted
            # by identity here rather than by the generic comparison below.
            if right_value is not NaT:
                return False

        elif left_value is libmissing.NA:
            # ``NA != NA`` is NA, whose truth value is ambiguous; match by
            # identity instead of falling through to the comparison.
            if right_value is not libmissing.NA:
                return False

        elif isinstance(left_value, float) and np.isnan(left_value):
            if not isinstance(right_value, float) or not np.isnan(right_value):
                return False
        else:
            try:
                if np.any(np.asarray(left_value != right_value)):
                    return False
            except TypeError as err:
                # Only the NA-ambiguity error means "not equal"; anything
                # else is a genuine failure and is re-raised.
                if "boolean value of NA is ambiguous" in str(err):
                    return False
                raise
            except ValueError:
                # numpy can raise a ValueError if left and right cannot be
                # compared (e.g. nested arrays)
                return False
    return True
582
583
def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
    """
    ExtensionArray-compatible implementation of array_equivalent.

    Arrays with different dtypes are never equal. ExtensionArrays are
    compared with their own ``equals`` method; anything else goes through
    ``array_equivalent`` with ``dtype_equal=True``.
    """
    if not is_dtype_equal(left.dtype, right.dtype):
        return False
    if isinstance(left, ABCExtensionArray):
        return left.equals(right)
    return array_equivalent(left, right, dtype_equal=True)
594
595
def infer_fill_value(val):
    """
    Infer the fill value for the nan/NaT from the provided
    scalar/ndarray/list-like.

    Parameters
    ----------
    val : scalar, ndarray or list-like
        Value whose dtype decides the fill value; a non-list-like is wrapped
        in a one-element list first.

    Returns
    -------
    np.ndarray or float
        A 0-dimensional ``"NaT"`` array of the matching dtype when ``val`` is
        datetime-like (or an object array inferred to hold datetimes or
        timedeltas), so that the block is constructed with the proper dtype;
        otherwise ``np.nan``.
    """
    if not is_list_like(val):
        val = [val]
    # ``np.asarray`` only copies when it has to. ``np.array(..., copy=False)``
    # raises under NumPy 2 whenever a copy is unavoidable (e.g. list input).
    val = np.asarray(val)
    if needs_i8_conversion(val.dtype):
        return np.array("NaT", dtype=val.dtype)
    elif is_object_dtype(val.dtype):
        dtype = lib.infer_dtype(ensure_object(val), skipna=False)
        if dtype in ["datetime", "datetime64"]:
            return np.array("NaT", dtype=DT64NS_DTYPE)
        elif dtype in ["timedelta", "timedelta64"]:
            return np.array("NaT", dtype=TD64NS_DTYPE)
    return np.nan
614
615
def maybe_fill(arr: np.ndarray) -> np.ndarray:
    """
    Fill ``arr`` in place with NaN unless its dtype is integer or boolean.

    The same array object is returned in either case.
    """
    cannot_hold_nan = arr.dtype.kind in ("u", "i", "b")
    if cannot_hold_nan:
        return arr
    arr.fill(np.nan)
    return arr
623
624
def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
    """
    Return the missing-value marker to use for ``dtype``.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
    compat : bool, default True
        When True, integer dtypes yield ``0`` and boolean dtypes yield
        ``False`` instead of ``np.nan``.

    Returns
    -------
    scalar
        ``dtype.na_value`` for extension dtypes, a ``NaT`` of the dtype's
        scalar type for datetime-like dtypes, otherwise ``np.nan`` (or the
        ``compat`` substitutes described above).

    Examples
    --------
    >>> na_value_for_dtype(np.dtype('int64'))
    0
    >>> na_value_for_dtype(np.dtype('int64'), compat=False)
    nan
    >>> na_value_for_dtype(np.dtype('float64'))
    nan
    >>> na_value_for_dtype(np.dtype('bool'))
    False
    >>> na_value_for_dtype(np.dtype('datetime64[ns]'))
    numpy.datetime64('NaT')
    """
    if isinstance(dtype, ExtensionDtype):
        return dtype.na_value
    if needs_i8_conversion(dtype):
        return dtype.type("NaT", "ns")

    # Only the compat substitutes differ from NaN, and floats never get one.
    if compat and not is_float_dtype(dtype):
        if is_integer_dtype(dtype):
            return 0
        if is_bool_dtype(dtype):
            return False
    return np.nan
667
668
def remove_na_arraylike(arr):
    """
    Return array-like containing only true/non-NaN values, possibly empty.

    Extension-array-backed input is checked as-is; anything else is
    converted with ``np.asarray`` before the validity mask is computed.
    """
    checked = arr if is_extension_array_dtype(arr) else np.asarray(arr)
    return arr[notna(checked)]
677
678
def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool:
    """
    isna check that excludes incompatible dtypes

    Parameters
    ----------
    obj : object
    dtype : np.dtype or ExtensionDtype
        Branches exist for datetime64, timedelta64, numeric, bool, numpy
        string and object dtypes, and for PeriodDtype, IntervalDtype and
        CategoricalDtype; any other dtype uses the fallback rule.

    Returns
    -------
    bool
        False whenever ``obj`` is not a scalar or is not missing.
    """
    if not (lib.is_scalar(obj) and isna(obj)):
        return False

    dt_or_td = (np.datetime64, np.timedelta64)
    dt_td_or_decimal = (np.datetime64, np.timedelta64, Decimal)

    if dtype.kind == "M":
        if isinstance(dtype, np.dtype):
            # i.e. not tzaware
            return not isinstance(obj, (np.timedelta64, Decimal))
        # we have to rule out tznaive dt64("NaT")
        return not isinstance(obj, dt_td_or_decimal)

    if dtype.kind == "m":
        return not isinstance(obj, (np.datetime64, Decimal))

    if dtype.kind in ("i", "u", "f", "c"):
        # Numeric
        return obj is not NaT and not isinstance(obj, dt_or_td)

    if dtype.kind == "b":
        # We allow pd.NA, None, np.nan in BooleanArray (same as IntervalDtype)
        return lib.is_float(obj) or obj is None or obj is libmissing.NA

    if dtype == _dtype_str:
        # numpy string dtypes to avoid float np.nan
        return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal, float))

    if dtype == _dtype_object:
        # This is needed for Categorical, but is kind of weird
        return True

    if isinstance(dtype, PeriodDtype):
        return not isinstance(obj, dt_td_or_decimal)

    if isinstance(dtype, IntervalDtype):
        return lib.is_float(obj) or obj is None or obj is libmissing.NA

    if isinstance(dtype, CategoricalDtype):
        # Defer to the rule for the dtype of the categories.
        return is_valid_na_for_dtype(obj, dtype.categories.dtype)

    # fallback, default to allowing NaN, None, NA, NaT
    return not isinstance(obj, dt_td_or_decimal)
728
729
def isna_all(arr: ArrayLike) -> bool:
    """
    Optimized equivalent to isna(arr).all()

    The array is scanned chunk by chunk and the scan stops at the first
    chunk that contains a non-missing value.
    """
    n_values = len(arr)

    # Usually it's enough to check but a small fraction of values to see if
    #  a block is NOT null, chunks should help in such cases.
    #  parameters 1000 and 40 were chosen arbitrarily
    step = max(n_values // 40, 1000)

    dtype = arr.dtype
    is_np_dtype = isinstance(dtype, np.dtype)

    if dtype.kind == "f" and is_np_dtype:
        checker = nan_checker
    elif (is_np_dtype and dtype.kind in ["m", "M"]) or isinstance(
        dtype, (DatetimeTZDtype, PeriodDtype)
    ):
        # datetime-likes: compare the int64 view against the NaT sentinel
        checker = lambda x: np.asarray(x.view("i8")) == iNaT  # type: ignore[assignment]
    else:
        checker = lambda x: _isna_array(  # type: ignore[assignment]
            x, inf_as_na=INF_AS_NA
        )

    for start in range(0, n_values, step):
        if not checker(arr[start : start + step]).all():
            return False
    return True