from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Any,
    Iterator,
    Literal,
    Sequence,
    TypeVar,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.tslibs import (
    get_unit_from_dtype,
    is_supported_unit,
)
from pandas._typing import (
    ArrayLike,
    AstypeArg,
    AxisInt,
    DtypeObj,
    NpDtype,
    PositionalIndexer,
    Scalar,
    ScalarIndexer,
    SequenceIndexer,
    Shape,
    npt,
)
from pandas.errors import AbstractMethodError
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.common import (
    is_bool,
    is_bool_dtype,
    is_datetime64_dtype,
    is_dtype_equal,
    is_float_dtype,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import BaseMaskedDtype
from pandas.core.dtypes.inference import is_array_like
from pandas.core.dtypes.missing import (
    array_equivalent,
    is_valid_na_for_dtype,
    isna,
    notna,
)

from pandas.core import (
    algorithms as algos,
    arraylike,
    missing,
    nanops,
    ops,
)
from pandas.core.algorithms import (
    factorize_array,
    isin,
    take,
)
from pandas.core.array_algos import (
    masked_accumulations,
    masked_reductions,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import ExtensionArray
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.indexers import check_array_indexer
from pandas.core.ops import invalid_comparison

if TYPE_CHECKING:
    from pandas import Series
    from pandas.core.arrays import BooleanArray
    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

from pandas.compat.numpy import function as nv

BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")


class BaseMaskedArray(OpsMixin, ExtensionArray):
    """
    Base class for masked arrays (which use _data and _mask to store the data).

    numpy based
    """
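
    # Illustrative sketch (not executed): a masked array pairs a plain NumPy
    # ``_data`` ndarray with a same-shaped boolean ``_mask`` in which ``True``
    # marks a missing entry. For example, ``pd.array([1, None], dtype="Int64")``
    # holds an int64 ``_data`` array (with an arbitrary fill value under the
    # masked slot) together with ``_mask == array([False, True])``.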

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value: Scalar
    # our underlying data and mask are each ndarrays
    _data: np.ndarray
    _mask: npt.NDArray[np.bool_]

    # Fill values used for any/all
    _truthy_value = Scalar  # bool(_truthy_value) = True
    _falsey_value = Scalar  # bool(_falsey_value) = False

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        # values is supposed to already be validated in the subclass
        if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_):
            raise TypeError(
                "mask should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.shape != mask.shape:
            raise ValueError("values.shape must match mask.shape")

        if copy:
            values = values.copy()
            mask = mask.copy()

        self._data = values
        self._mask = mask

    @classmethod
    def _from_sequence(
        cls: type[BaseMaskedArrayT], scalars, *, dtype=None, copy: bool = False
    ) -> BaseMaskedArrayT:
        values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy)
        return cls(values, mask)

    @property
    def dtype(self) -> BaseMaskedDtype:
        raise AbstractMethodError(self)

    @overload
    def __getitem__(self, item: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(self: BaseMaskedArrayT, item: SequenceIndexer) -> BaseMaskedArrayT:
        ...

    def __getitem__(
        self: BaseMaskedArrayT, item: PositionalIndexer
    ) -> BaseMaskedArrayT | Any:
        item = check_array_indexer(self, item)

        newmask = self._mask[item]
        if is_bool(newmask):
            # This is a scalar indexing
            if newmask:
                return self.dtype.na_value
            return self._data[item]

        return type(self)(self._data[item], newmask)

    @doc(ExtensionArray.fillna)
    def fillna(
        self: BaseMaskedArrayT, value=None, method=None, limit=None
    ) -> BaseMaskedArrayT:
        value, method = validate_fillna_kwargs(value, method)

        mask = self._mask

        if is_array_like(value):
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f" expected {len(self)}"
                )
            value = value[mask]

        if mask.any():
            if method is not None:
                func = missing.get_fill_func(method, ndim=self.ndim)
                npvalues = self._data.copy().T
                new_mask = mask.copy().T
                func(npvalues, limit=limit, mask=new_mask)
                return type(self)(npvalues.T, new_mask.T)
            else:
                # fill with value
                new_values = self.copy()
                new_values[mask] = value
        else:
            new_values = self.copy()
        return new_values

    @classmethod
    def _coerce_to_array(
        cls, values, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        raise AbstractMethodError(cls)

    def _validate_setitem_value(self, value):
        """
        Check if we have a scalar that we can cast losslessly.

        Raises
        ------
        TypeError
        """
        kind = self.dtype.kind
        # TODO: get this all from np_can_hold_element?
        if kind == "b":
            if lib.is_bool(value):
                return value

        elif kind == "f":
            if lib.is_integer(value) or lib.is_float(value):
                return value

        else:
            if lib.is_integer(value) or (lib.is_float(value) and value.is_integer()):
                return value
            # TODO: unsigned checks

        # Note: without the "str" here, the f-string rendering raises in
        # py38 builds.
        raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}")

    def __setitem__(self, key, value) -> None:
        key = check_array_indexer(self, key)

        if is_scalar(value):
            if is_valid_na_for_dtype(value, self.dtype):
                self._mask[key] = True
            else:
                value = self._validate_setitem_value(value)
                self._data[key] = value
                self._mask[key] = False
            return

        value, mask = self._coerce_to_array(value, dtype=self.dtype)

        self._data[key] = value
        self._mask[key] = mask

    def __iter__(self) -> Iterator:
        if self.ndim == 1:
            if not self._hasna:
                for val in self._data:
                    yield val
            else:
                na_value = self.dtype.na_value
                for isna_, val in zip(self._mask, self._data):
                    if isna_:
                        yield na_value
                    else:
                        yield val
        else:
            for i in range(len(self)):
                yield self[i]

    def __len__(self) -> int:
        return len(self._data)

    @property
    def shape(self) -> Shape:
        return self._data.shape

    @property
    def ndim(self) -> int:
        return self._data.ndim

    def swapaxes(self: BaseMaskedArrayT, axis1, axis2) -> BaseMaskedArrayT:
        data = self._data.swapaxes(axis1, axis2)
        mask = self._mask.swapaxes(axis1, axis2)
        return type(self)(data, mask)

    def delete(self: BaseMaskedArrayT, loc, axis: AxisInt = 0) -> BaseMaskedArrayT:
        data = np.delete(self._data, loc, axis=axis)
        mask = np.delete(self._mask, loc, axis=axis)
        return type(self)(data, mask)

    def reshape(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
        data = self._data.reshape(*args, **kwargs)
        mask = self._mask.reshape(*args, **kwargs)
        return type(self)(data, mask)

    def ravel(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
        # TODO: need to make sure we have the same order for data/mask
        data = self._data.ravel(*args, **kwargs)
        mask = self._mask.ravel(*args, **kwargs)
        return type(self)(data, mask)

    @property
    def T(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        return type(self)(self._data.T, self._mask.T)

    def round(self, decimals: int = 0, *args, **kwargs):
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect but might be
            accepted for compatibility with NumPy.

        Returns
        -------
        NumericArray
            Rounded values of the NumericArray.

        See Also
        --------
        numpy.around : Round values of an np.array.
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
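
        Examples
        --------
        An illustrative example; masked (NA) entries stay masked in the result.

        >>> arr = pd.array([1.1, 3.9, None], dtype="Float64")
        >>> arr.round()
        <FloatingArray>
        [1.0, 4.0, <NA>]
        Length: 3, dtype: Float64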
        """
        nv.validate_round(args, kwargs)
        values = np.round(self._data, decimals=decimals, **kwargs)

        # Usually we'll get same type as self, but ndarray[bool] casts to float
        return self._maybe_mask_result(values, self._mask.copy())

    # ------------------------------------------------------------------
    # Unary Methods

    def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        return type(self)(~self._data, self._mask.copy())

    def __neg__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        return type(self)(-self._data, self._mask.copy())

    def __pos__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        return self.copy()

    def __abs__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        return type(self)(abs(self._data), self._mask.copy())

    # ------------------------------------------------------------------

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary. This is typically
            only possible when no missing values are present and `dtype`
            is the equivalent numpy dtype.
        na_value : scalar, optional
            Scalar missing value indicator to use in numpy array. Defaults
            to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, <NA>], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, <NA>]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to bool numpy array in presence of missing values

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            if (
                not is_object_dtype(dtype)
                and not is_string_dtype(dtype)
                and na_value is libmissing.NA
            ):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype."
                )
            # don't pass copy to astype -> always need a copy since we are mutating
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                data = self._data.astype(dtype, copy=copy)
        return data

    @doc(ExtensionArray.tolist)
    def tolist(self):
        if self.ndim > 1:
            return [x.tolist() for x in self]
        dtype = None if self._hasna else self._data.dtype
        return self.to_numpy(dtype=dtype).tolist()

    @overload
    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
        ...

    @overload
    def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
        ...

    @overload
    def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
        ...

    def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        # if we are astyping to another nullable masked dtype, we can fastpath
        if isinstance(dtype, BaseMaskedDtype):
            # TODO deal with NaNs for FloatingArray case
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore", category=RuntimeWarning)
                # TODO: Is rounding what we want long term?
                data = self._data.astype(dtype.numpy_dtype, copy=copy)
            # mask is copied depending on whether the data was copied, and
            # not directly depending on the `copy` keyword
            mask = self._mask if data is self._data else self._mask.copy()
            cls = dtype.construct_array_type()
            return cls(data, mask, copy=False)

        if isinstance(dtype, ExtensionDtype):
            eacls = dtype.construct_array_type()
            return eacls._from_sequence(self, dtype=dtype, copy=copy)

        na_value: float | np.datetime64 | lib.NoDefault

        # coerce
        if is_float_dtype(dtype):
            # In astype, we consider dtype=float to also mean na_value=np.nan
            na_value = np.nan
        elif is_datetime64_dtype(dtype):
            na_value = np.datetime64("NaT")
        else:
            na_value = lib.no_default

        # to_numpy will also raise, but we get somewhat nicer exception messages here
        if is_integer_dtype(dtype) and self._hasna:
            raise ValueError("cannot convert NA to integer")
        if is_bool_dtype(dtype) and self._hasna:
            # careful: astype_nansafe converts np.nan to True
            raise ValueError("cannot convert float NaN to bool")

        data = self.to_numpy(dtype=dtype, na_value=na_value, copy=copy)
        return data

    __array_priority__ = 1000  # higher than ndarray so ops dispatch to us

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        """
        the array interface, return my values
        We return an object array here to preserve our scalar values
        """
        return self.to_numpy(dtype=dtype)

    _HANDLED_TYPES: tuple[type, ...]

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        # For MaskedArray inputs, we apply the ufunc to ._data
        # and mask the result.

        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (BaseMaskedArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. test_ufunc_with_out
            return arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                return result

        mask = np.zeros(len(self), dtype=bool)
        inputs2 = []
        for x in inputs:
            if isinstance(x, BaseMaskedArray):
                mask |= x._mask
                inputs2.append(x._data)
            else:
                inputs2.append(x)

        def reconstruct(x):
            # we don't worry about scalar `x` here, since we
            # raise for reduce up above.
            from pandas.core.arrays import (
                BooleanArray,
                FloatingArray,
                IntegerArray,
            )

            if is_bool_dtype(x.dtype):
                m = mask.copy()
                return BooleanArray(x, m)
            elif is_integer_dtype(x.dtype):
                m = mask.copy()
                return IntegerArray(x, m)
            elif is_float_dtype(x.dtype):
                m = mask.copy()
                if x.dtype == np.float16:
                    # reached in e.g. np.sqrt on BooleanArray
                    # we don't support float16
                    x = x.astype(np.float32)
                return FloatingArray(x, m)
            else:
                x[mask] = np.nan
            return x

        result = getattr(ufunc, method)(*inputs2, **kwargs)
        if ufunc.nout > 1:
            # e.g. np.divmod
            return tuple(reconstruct(x) for x in result)
        elif method == "reduce":
            # e.g. np.add.reduce; test_ufunc_reduce_raises
            if self._mask.any():
                return self._na_value
            return result
        else:
            return reconstruct(result)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        return pa.array(self._data, mask=self._mask, type=type)

    @property
    def _hasna(self) -> bool:
        # Note: this is expensive right now! The hope is that we can
        # make this faster by having an optional mask, but not have to change
        # source code using it..

        # error: Incompatible return value type (got "bool_", expected "bool")
        return self._mask.any()  # type: ignore[return-value]

    def _propagate_mask(
        self, mask: npt.NDArray[np.bool_] | None, other
    ) -> npt.NDArray[np.bool_]:
        if mask is None:
            mask = self._mask.copy()  # TODO: need test for BooleanArray needing a copy
            if other is libmissing.NA:
                # GH#45421 don't alter inplace
                mask = mask | True
            elif is_list_like(other) and len(other) == len(mask):
                mask = mask | isna(other)
        else:
            mask = self._mask | mask
        # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]",
        # expected "ndarray[Any, dtype[bool_]]")
        return mask  # type: ignore[return-value]

    def _arith_method(self, other, op):
        op_name = op.__name__
        omask = None

        if (
            not hasattr(other, "dtype")
            and is_list_like(other)
            and len(other) == len(self)
        ):
            # Try inferring masked dtype instead of casting to object
            inferred_dtype = lib.infer_dtype(other, skipna=True)
            if inferred_dtype == "integer":
                from pandas.core.arrays import IntegerArray

                other = IntegerArray._from_sequence(other)
            elif inferred_dtype in ["floating", "mixed-integer-float"]:
                from pandas.core.arrays import FloatingArray

                other = FloatingArray._from_sequence(other)

            elif inferred_dtype in ["boolean"]:
                from pandas.core.arrays import BooleanArray

                other = BooleanArray._from_sequence(other)

        if isinstance(other, BaseMaskedArray):
            other, omask = other._data, other._mask

        elif is_list_like(other):
            if not isinstance(other, ExtensionArray):
                other = np.asarray(other)
                if other.ndim > 1:
                    raise NotImplementedError("can only perform ops with 1-d structures")

        # We wrap the non-masked arithmetic logic used for numpy dtypes
        # in Series/Index arithmetic ops.
        other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
        pd_op = ops.get_array_op(op)
        other = ensure_wrapped_if_datetimelike(other)

        if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
            # Avoid DeprecationWarning: In future, it will be an error
            # for 'np.bool_' scalars to be interpreted as an index
            # e.g. test_array_scalar_like_equivalence
            other = bool(other)

        mask = self._propagate_mask(omask, other)

        if other is libmissing.NA:
            result = np.ones_like(self._data)
            if self.dtype.kind == "b":
                if op_name in {
                    "floordiv",
                    "rfloordiv",
                    "pow",
                    "rpow",
                    "truediv",
                    "rtruediv",
                }:
                    # GH#41165 Try to match non-masked Series behavior
                    # This is still imperfect GH#46043
                    raise NotImplementedError(
                        f"operator '{op_name}' not implemented for bool dtypes"
                    )
                if op_name in {"mod", "rmod"}:
                    dtype = "int8"
                else:
                    dtype = "bool"
                result = result.astype(dtype)
            elif "truediv" in op_name and self.dtype.kind != "f":
                # The actual data here doesn't matter since the mask
                # will be all-True, but since this is division, we want
                # to end up with floating dtype.
                result = result.astype(np.float64)
        else:
            # Make sure we do this before the "pow" mask checks
            # to get an expected exception message on shape mismatch.
            if self.dtype.kind in ["i", "u"] and op_name in ["floordiv", "mod"]:
                # TODO(GH#30188) ATM we don't match the behavior of non-masked
                # types with respect to floordiv-by-zero
                pd_op = op

            with np.errstate(all="ignore"):
                result = pd_op(self._data, other)

        if op_name == "pow":
            # 1 ** x is 1.
            mask = np.where((self._data == 1) & ~self._mask, False, mask)
            # x ** 0 is 1.
            if omask is not None:
                mask = np.where((other == 0) & ~omask, False, mask)
            elif other is not libmissing.NA:
                mask = np.where(other == 0, False, mask)

        elif op_name == "rpow":
            # 1 ** x is 1.
            if omask is not None:
                mask = np.where((other == 1) & ~omask, False, mask)
            elif other is not libmissing.NA:
                mask = np.where(other == 1, False, mask)
            # x ** 0 is 1.
            mask = np.where((self._data == 0) & ~self._mask, False, mask)

        return self._maybe_mask_result(result, mask)

    _logical_method = _arith_method

    def _cmp_method(self, other, op) -> BooleanArray:
        from pandas.core.arrays import BooleanArray

        mask = None

        if isinstance(other, BaseMaskedArray):
            other, mask = other._data, other._mask

        elif is_list_like(other):
            other = np.asarray(other)
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        if other is libmissing.NA:
            # numpy does not handle pd.NA well as "other" scalar (it returns
            # a scalar False instead of an array)
            # This may be fixed by NA.__array_ufunc__. Revisit this check
            # once that's implemented.
            result = np.zeros(self._data.shape, dtype="bool")
            mask = np.ones(self._data.shape, dtype="bool")
        else:
            with warnings.catch_warnings():
                # numpy may show a FutureWarning or DeprecationWarning:
                # elementwise comparison failed; returning scalar instead,
                # but in the future will perform elementwise comparison
                # before returning NotImplemented. We fall back to the correct
                # behavior today, so that should be fine to ignore.
                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
                warnings.filterwarnings("ignore", "elementwise", DeprecationWarning)
                with np.errstate(all="ignore"):
                    method = getattr(self._data, f"__{op.__name__}__")
                    result = method(other)

                if result is NotImplemented:
                    result = invalid_comparison(self._data, other, op)

        mask = self._propagate_mask(mask, other)
        return BooleanArray(result, mask, copy=False)

    def _maybe_mask_result(self, result, mask):
        """
        Parameters
        ----------
        result : array-like or tuple[array-like]
        mask : array-like bool
        """
        if isinstance(result, tuple):
            # i.e. divmod
            div, mod = result
            return (
                self._maybe_mask_result(div, mask),
                self._maybe_mask_result(mod, mask),
            )

        if is_float_dtype(result.dtype):
            from pandas.core.arrays import FloatingArray

            return FloatingArray(result, mask, copy=False)

        elif is_bool_dtype(result.dtype):
            from pandas.core.arrays import BooleanArray

            return BooleanArray(result, mask, copy=False)

        elif (
            isinstance(result.dtype, np.dtype)
            and result.dtype.kind == "m"
            and is_supported_unit(get_unit_from_dtype(result.dtype))
        ):
            # e.g. test_numeric_arr_mul_tdscalar_numexpr_path
            from pandas.core.arrays import TimedeltaArray

            if not isinstance(result, TimedeltaArray):
                result = TimedeltaArray._simple_new(result, dtype=result.dtype)

            result[mask] = result.dtype.type("NaT")
            return result

        elif is_integer_dtype(result.dtype):
            from pandas.core.arrays import IntegerArray

            return IntegerArray(result, mask, copy=False)

        else:
            result[mask] = np.nan
            return result

    def isna(self) -> np.ndarray:
        return self._mask.copy()

    @property
    def _na_value(self):
        return self.dtype.na_value

    @property
    def nbytes(self) -> int:
        return self._data.nbytes + self._mask.nbytes

    @classmethod
    def _concat_same_type(
        cls: type[BaseMaskedArrayT],
        to_concat: Sequence[BaseMaskedArrayT],
        axis: AxisInt = 0,
    ) -> BaseMaskedArrayT:
        data = np.concatenate([x._data for x in to_concat], axis=axis)
        mask = np.concatenate([x._mask for x in to_concat], axis=axis)
        return cls(data, mask)

    def take(
        self: BaseMaskedArrayT,
        indexer,
        *,
        allow_fill: bool = False,
        fill_value: Scalar | None = None,
        axis: AxisInt = 0,
    ) -> BaseMaskedArrayT:
        # we always fill with 1 internally
        # to avoid upcasting
        data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
        result = take(
            self._data,
            indexer,
            fill_value=data_fill_value,
            allow_fill=allow_fill,
            axis=axis,
        )

        mask = take(
            self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis
        )

        # if we are filling
        # we only fill where the indexer is null
        # not existing missing values
        # TODO(jreback) what if we have a non-na float as a fill value?
        if allow_fill and notna(fill_value):
            fill_mask = np.asarray(indexer) == -1
            result[fill_mask] = fill_value
            mask = mask ^ fill_mask

        return type(self)(result, mask, copy=False)

    # error: Return type "BooleanArray" of "isin" incompatible with return type
    # "ndarray" in supertype "ExtensionArray"
    def isin(self, values) -> BooleanArray:  # type: ignore[override]
        from pandas.core.arrays import BooleanArray

        # algorithms.isin will eventually convert values to an ndarray, so no extra
        # cost to doing it here first
        values_arr = np.asarray(values)
        result = isin(self._data, values_arr)

        if self._hasna:
            values_have_NA = is_object_dtype(values_arr.dtype) and any(
                val is self.dtype.na_value for val in values_arr
            )

            # For now, NA does not propagate so set result according to presence of NA,
            # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
            result[self._mask] = values_have_NA

        mask = np.zeros(self._data.shape, dtype=bool)
        return BooleanArray(result, mask, copy=False)

    def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        data, mask = self._data, self._mask
        data = data.copy()
        mask = mask.copy()
        return type(self)(data, mask, copy=False)

    def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
        """
        Compute the BaseMaskedArray of unique values.

        Returns
        -------
        uniques : BaseMaskedArray
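
        Examples
        --------
        An illustrative example; a missing value appears once among the uniques.

        >>> pd.array([1, 2, 3, 2, None], dtype="Int64").unique()
        <IntegerArray>
        [1, 2, 3, <NA>]
        Length: 4, dtype: Int64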
        """
        uniques, mask = algos.unique_with_mask(self._data, self._mask)
        return type(self)(uniques, mask, copy=False)

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        if isinstance(value, ExtensionArray):
            value = value.astype(object)
        # Base class searchsorted would cast to object, which is *much* slower.
        return self._data.searchsorted(value, side=side, sorter=sorter)

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        arr = self._data
        mask = self._mask

        # Use a sentinel for na; recode and add NA to uniques if necessary below
        codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask)

        # check that factorize_array correctly preserves dtype.
        assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)

        has_na = mask.any()
        if use_na_sentinel or not has_na:
            size = len(uniques)
        else:
            # Make room for an NA value
            size = len(uniques) + 1
        uniques_mask = np.zeros(size, dtype=bool)
        if not use_na_sentinel and has_na:
            na_index = mask.argmax()
            # Insert na with the proper code
            if na_index == 0:
                na_code = np.intp(0)
            else:
                # mypy error: Slice index must be an integer or None
                # https://github.com/python/mypy/issues/2410
                na_code = codes[:na_index].max() + 1  # type: ignore[misc]
            codes[codes >= na_code] += 1
            codes[codes == -1] = na_code
            # dummy value for uniques; not used since uniques_mask will be True
            uniques = np.insert(uniques, na_code, 0)
            uniques_mask[na_code] = True
        uniques_ea = type(self)(uniques, uniques_mask)

        return codes, uniques_ea

    @doc(ExtensionArray._values_for_argsort)
    def _values_for_argsort(self) -> np.ndarray:
        return self._data

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        from pandas import (
            Index,
            Series,
        )
        from pandas.arrays import IntegerArray

        keys, value_counts = algos.value_counts_arraylike(
            self._data, dropna=True, mask=self._mask
        )

        if dropna:
            res = Series(value_counts, index=keys, name="count", copy=False)
            res.index = res.index.astype(self.dtype)
            res = res.astype("Int64")
            return res

        # if we want nans, count the mask
        counts = np.empty(len(value_counts) + 1, dtype="int64")
        counts[:-1] = value_counts
        counts[-1] = self._mask.sum()

        index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
        index = index.astype(self.dtype)

        mask = np.zeros(len(counts), dtype="bool")
        counts_array = IntegerArray(counts, mask)

        return Series(counts_array, index=index, name="count", copy=False)

    @doc(ExtensionArray.equals)
    def equals(self, other) -> bool:
        if type(self) != type(other):
            return False
        if other.dtype != self.dtype:
            return False

        # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT
        # equal.
        if not np.array_equal(self._mask, other._mask):
            return False

        left = self._data[~self._mask]
        right = other._data[~other._mask]
        return array_equivalent(left, right, dtype_equal=True)

    def _quantile(
        self, qs: npt.NDArray[np.float64], interpolation: str
    ) -> BaseMaskedArray:
        """
        Dispatch to quantile_with_mask, needed because we do not have
        _from_factorized.

        Notes
        -----
        We assume that all impacted cases are 1D-only.
        """
        res = quantile_with_mask(
            self._data,
            mask=self._mask,
            # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
            # instead of np.nan
            fill_value=np.nan,
            qs=qs,
            interpolation=interpolation,
        )

        if self._hasna:
            # Our result mask is all-False unless we are all-NA, in which
            # case it is all-True.
            if self.ndim == 2:
                # I think this should be out_mask=self.isna().all(axis=1)
                # but am holding off until we have tests
                raise NotImplementedError
            if self.isna().all():
                out_mask = np.ones(res.shape, dtype=bool)

                if is_integer_dtype(self.dtype):
                    # We try to maintain int dtype if possible for not all-na case
                    # as well
                    res = np.zeros(res.shape, dtype=self.dtype.numpy_dtype)
            else:
                out_mask = np.zeros(res.shape, dtype=bool)
        else:
            out_mask = np.zeros(res.shape, dtype=bool)
        return self._maybe_mask_result(res, mask=out_mask)

    # ------------------------------------------------------------------
    # Reductions

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        if name in {"any", "all", "min", "max", "sum", "prod", "mean", "var", "std"}:
            return getattr(self, name)(skipna=skipna, **kwargs)

        data = self._data
        mask = self._mask

        # median, skew, kurt, sem
        op = getattr(nanops, f"nan{name}")
        result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

        if np.isnan(result):
            return libmissing.NA

        return result

    def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
        if isinstance(result, np.ndarray):
            axis = kwargs["axis"]
            if skipna:
                # we only retain mask for all-NA rows/columns
                mask = self._mask.all(axis=axis)
            else:
                mask = self._mask.any(axis=axis)

            return self._maybe_mask_result(result, mask)
        return result

    def sum(
        self,
        *,
        skipna: bool = True,
        min_count: int = 0,
        axis: AxisInt | None = 0,
        **kwargs,
    ):
        nv.validate_sum((), kwargs)

        # TODO: do this in validate_sum?
        if "out" in kwargs:
            # np.sum; test_floating_array_numpy_sum
            if kwargs["out"] is not None:
                raise NotImplementedError
            kwargs.pop("out")

        result = masked_reductions.sum(
            self._data,
            self._mask,
            skipna=skipna,
            min_count=min_count,
            axis=axis,
        )
        return self._wrap_reduction_result(
            "sum", result, skipna=skipna, axis=axis, **kwargs
        )

    def prod(
        self,
        *,
        skipna: bool = True,
        min_count: int = 0,
        axis: AxisInt | None = 0,
        **kwargs,
    ):
        nv.validate_prod((), kwargs)
        result = masked_reductions.prod(
            self._data,
            self._mask,
            skipna=skipna,
            min_count=min_count,
            axis=axis,
        )
        return self._wrap_reduction_result(
            "prod", result, skipna=skipna, axis=axis, **kwargs
        )

    def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
        nv.validate_mean((), kwargs)
        result = masked_reductions.mean(
            self._data,
            self._mask,
            skipna=skipna,
            axis=axis,
        )
        return self._wrap_reduction_result(
            "mean", result, skipna=skipna, axis=axis, **kwargs
        )

    def var(
        self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
    ):
        nv.validate_stat_ddof_func((), kwargs, fname="var")
        result = masked_reductions.var(
            self._data,
            self._mask,
            skipna=skipna,
            axis=axis,
            ddof=ddof,
        )
        return self._wrap_reduction_result(
            "var", result, skipna=skipna, axis=axis, **kwargs
        )

    def std(
        self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
    ):
        nv.validate_stat_ddof_func((), kwargs, fname="std")
        result = masked_reductions.std(
            self._data,
            self._mask,
            skipna=skipna,
            axis=axis,
            ddof=ddof,
        )
        return self._wrap_reduction_result(
            "std", result, skipna=skipna, axis=axis, **kwargs
        )

    def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
        nv.validate_min((), kwargs)
        return masked_reductions.min(
            self._data,
            self._mask,
            skipna=skipna,
            axis=axis,
        )

    def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
        nv.validate_max((), kwargs)
        return masked_reductions.max(
            self._data,
            self._mask,
            skipna=skipna,
            axis=axis,
        )

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        .. versionchanged:: 1.4.0

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.
        **kwargs : any, default None
            Additional keywords have no effect but might be accepted for
            compatibility with NumPy.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        numpy.any : Numpy version of this method.
        BaseMaskedArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True]).any()
        True
        >>> pd.array([True, False, pd.NA]).any()
        True
        >>> pd.array([False, False, pd.NA]).any()
        False
        >>> pd.array([], dtype="boolean").any()
        False
        >>> pd.array([pd.NA], dtype="boolean").any()
        False
        >>> pd.array([pd.NA], dtype="Float64").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA]).any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA]).any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA]).any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA]).any(skipna=False)
        <NA>
        """
        kwargs.pop("axis", None)
        nv.validate_any((), kwargs)

        values = self._data.copy()
        # error: Argument 3 to "putmask" has incompatible type "object";
        # expected "Union[_SupportsArray[dtype[Any]],
        # _NestedSequence[_SupportsArray[dtype[Any]]],
        # bool, int, float, complex, str, bytes,
        # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
        np.putmask(values, self._mask, self._falsey_value)  # type: ignore[arg-type]
        result = values.any()
        if skipna:
            return result
        else:
            if result or len(self) == 0 or not self._mask.any():
                return result
            else:
                return self.dtype.na_value

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        .. versionchanged:: 1.4.0

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.
        **kwargs : any, default None
            Additional keywords have no effect but might be accepted for
            compatibility with NumPy.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        numpy.all : Numpy version of this method.
        BooleanArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA]).all()
        True
        >>> pd.array([1, 1, pd.NA]).all()
        True
        >>> pd.array([True, False, pd.NA]).all()
        False
        >>> pd.array([], dtype="boolean").all()
        True
        >>> pd.array([pd.NA], dtype="boolean").all()
        True
        >>> pd.array([pd.NA], dtype="Float64").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA]).all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA]).all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA]).all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA]).all(skipna=False)
        False
        """
        kwargs.pop("axis", None)
        nv.validate_all((), kwargs)

        values = self._data.copy()
        # error: Argument 3 to "putmask" has incompatible type "object";
        # expected "Union[_SupportsArray[dtype[Any]],
        # _NestedSequence[_SupportsArray[dtype[Any]]],
        # bool, int, float, complex, str, bytes,
        # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]"
        np.putmask(values, self._mask, self._truthy_value)  # type: ignore[arg-type]
        result = values.all()

        if skipna:
            return result
        else:
            if not result or len(self) == 0 or not self._mask.any():
                return result
            else:
                return self.dtype.na_value

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> BaseMaskedArray:
        data = self._data
        mask = self._mask

        op = getattr(masked_accumulations, name)
        data, mask = op(data, mask, skipna=skipna, **kwargs)

        return type(self)(data, mask, copy=False)