1from __future__ import annotations
2
3import numbers
4from typing import (
5 TYPE_CHECKING,
6 ClassVar,
7 cast,
8)
9
10import numpy as np
11
12from pandas._libs import (
13 lib,
14 missing as libmissing,
15)
16
17from pandas.core.dtypes.common import is_list_like
18from pandas.core.dtypes.dtypes import register_extension_dtype
19from pandas.core.dtypes.missing import isna
20
21from pandas.core import ops
22from pandas.core.array_algos import masked_accumulations
23from pandas.core.arrays.masked import (
24 BaseMaskedArray,
25 BaseMaskedDtype,
26)
27
28if TYPE_CHECKING:
29 import pyarrow
30
31 from pandas._typing import (
32 Dtype,
33 DtypeObj,
34 Self,
35 npt,
36 type_t,
37 )
38
39
@register_extension_dtype
class BooleanDtype(BaseMaskedDtype):
    """
    Extension dtype for boolean data.

    .. warning::

       BooleanDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.BooleanDtype()
    BooleanDtype
    """

    name: ClassVar[str] = "boolean"

    # https://github.com/python/mypy/issues/4125
    # error: Signature of "type" incompatible with supertype "BaseMaskedDtype"
    @property
    def type(self) -> type:  # type: ignore[override]
        """Scalar type reported for this dtype (``numpy.bool_``)."""
        return np.bool_

    @property
    def kind(self) -> str:
        """Single-character kind code; always ``"b"``."""
        return "b"

    @property
    def numpy_dtype(self) -> np.dtype:
        """The numpy dtype ``bool``."""
        return np.dtype("bool")

    @classmethod
    def construct_array_type(cls) -> type_t[BooleanArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        return BooleanArray

    def __repr__(self) -> str:
        return "BooleanDtype"

    @property
    def _is_boolean(self) -> bool:
        return True

    @property
    def _is_numeric(self) -> bool:
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BooleanArray:
        """
        Construct BooleanArray from pyarrow Array/ChunkedArray.

        Parameters
        ----------
        array : pyarrow.Array or pyarrow.ChunkedArray
            Must be of arrow boolean type or arrow null type.

        Returns
        -------
        BooleanArray

        Raises
        ------
        TypeError
            If the arrow type is neither boolean nor null.
        """
        import pyarrow

        arrow_type = array.type
        is_null_type = pyarrow.types.is_null(arrow_type)
        if arrow_type != pyarrow.bool_() and not is_null_type:
            raise TypeError(f"Expected array of boolean type, got {array.type} instead")

        if isinstance(array, pyarrow.Array):
            chunks, length = [array], len(array)
        else:
            # pyarrow.ChunkedArray
            chunks, length = array.chunks, array.length()

        if is_null_type:
            # Every entry is missing, so the data buffer never needs real
            # contents; only the all-True mask matters.
            all_missing = np.ones(length, dtype=bool)
            placeholder = np.empty(length, dtype=bool)
            return BooleanArray(placeholder, all_missing)

        def convert_chunk(chunk) -> BooleanArray:
            """Turn one arrow chunk into a BooleanArray (data + mask)."""
            buffers = chunk.buffers()

            def unpack(buffer) -> np.ndarray:
                # Reinterpret a raw buffer as a boolean arrow array without
                # a validity buffer, then materialize it as numpy.
                return pyarrow.BooleanArray.from_buffers(
                    chunk.type, len(chunk), [None, buffer], offset=chunk.offset
                ).to_numpy(zero_copy_only=False)

            values = unpack(buffers[1])
            if chunk.null_count == 0:
                missing = np.zeros(len(chunk), dtype=bool)
            else:
                # buffers[0] flags the valid slots; the mask wants the
                # opposite (True means missing).
                missing = ~unpack(buffers[0])
            return BooleanArray(values, missing)

        pieces = [convert_chunk(chunk) for chunk in chunks]
        if pieces:
            return BooleanArray._concat_same_type(pieces)
        # A ChunkedArray with zero chunks yields an empty result.
        return BooleanArray(np.array([], dtype=np.bool_), np.array([], dtype=np.bool_))
150
151
def coerce_to_array(
    values, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
        A BooleanArray, a numpy array, or any sequence of bool-like
        scalars (booleans, 0/1 numbers, missing values).
    mask : bool 1D array, optional
        Additional missing-value flags; combined with the missing values
        detected in ``values``. Not allowed when ``values`` is a
        BooleanArray.
    copy : bool, default False
        If True, copy inputs that would otherwise be returned as-is.

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    TypeError
        If ``values`` holds anything that is not bool-like.
    ValueError
        If a mask is passed together with a BooleanArray, or if the
        resulting values and mask differ in shape.
    """
    # Fast path: a BooleanArray already carries both pieces.
    if isinstance(values, BooleanArray):
        if mask is not None:
            raise ValueError("cannot pass mask for BooleanArray input")
        data, na_mask = values._data, values._mask
        if copy:
            return data.copy(), na_mask.copy()
        return data, na_mask

    is_ndarray = isinstance(values, np.ndarray)
    inferred_mask = None

    if is_ndarray and values.dtype == np.bool_:
        # Already boolean and cannot hold missing values.
        if copy:
            values = values.copy()
    elif is_ndarray and values.dtype.kind in "iufcb":
        inferred_mask = isna(values)
        valid = ~inferred_mask
        valid_values = values[valid]

        as_bool = np.zeros(len(values), dtype=bool)
        as_bool[valid] = valid_values.astype(bool)

        # Casting back must reproduce the input, otherwise something other
        # than 0/1 was present.
        round_trip = as_bool[valid].astype(values.dtype)
        if not np.all(round_trip == valid_values):
            raise TypeError("Need to pass bool-like values")

        values = as_bool
    else:
        obj_values = np.asarray(values, dtype=object)

        inferred = lib.infer_dtype(obj_values, skipna=True)
        numeric_like = ("floating", "integer", "mixed-integer-float")
        if inferred not in ("boolean", "empty", *numeric_like):
            raise TypeError("Need to pass bool-like values")

        # mypy does not narrow the type of the inferred mask to
        # npt.NDArray[np.bool_] within this branch, it assumes it can also
        # be None
        inferred_mask = cast("npt.NDArray[np.bool_]", isna(obj_values))
        valid = ~inferred_mask
        values = np.zeros(len(values), dtype=bool)
        values[valid] = obj_values[valid].astype(bool)

        # Numeric input is only acceptable when every entry is exactly 0 or 1.
        if inferred in numeric_like and not np.all(
            values[valid].astype(float) == obj_values[valid].astype(float)
        ):
            raise TypeError("Need to pass bool-like values")

    # Merge the caller-supplied mask with the missing values found above.
    if mask is None:
        if inferred_mask is None:
            mask = np.zeros(values.shape, dtype=bool)
        else:
            mask = inferred_mask
    elif isinstance(mask, np.ndarray) and mask.dtype == np.bool_:
        if inferred_mask is not None:
            mask = mask | inferred_mask
        elif copy:
            mask = mask.copy()
    else:
        mask = np.array(mask, dtype=bool)
        if inferred_mask is not None:
            mask = mask | inferred_mask

    if values.shape != mask.shape:
        raise ValueError("values.shape and mask.shape must match")

    return values, mask
237
238
class BooleanArray(BaseMaskedArray):
    """
    Array of boolean (True/False) data with missing values.

    This is a pandas Extension array for boolean data, under the hood
    represented by 2 numpy arrays: a boolean array with the data and
    a boolean array with the mask (True indicating missing).

    BooleanArray implements Kleene logic (sometimes called three-value
    logic) for logical operations. See :ref:`boolean.kleene` for more.

    To construct a BooleanArray from generic array-like input, use
    :func:`pandas.array` specifying ``dtype="boolean"`` (see examples
    below).

    .. warning::

       BooleanArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : numpy.ndarray
        A 1-d boolean-dtype array with the data.
    mask : numpy.ndarray
        A 1-d boolean-dtype array indicating missing values (True
        indicates missing).
    copy : bool, default False
        Whether to copy the `values` and `mask` arrays.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    BooleanArray

    Examples
    --------
    Create a BooleanArray with :func:`pandas.array`:

    >>> pd.array([True, False, None], dtype="boolean")
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean
    """

    # The value used to fill '_data' to avoid upcasting
    _internal_fill_value = False
    # Fill values used for any/all
    # Incompatible types in assignment (expression has type "bool", base class
    # "BaseMaskedArray" defined the type as "<typing special form>")
    _truthy_value = True  # type: ignore[assignment]
    _falsey_value = False  # type: ignore[assignment]
    # Strings that _from_sequence_of_strings accepts out of the box.
    _TRUE_VALUES = {"True", "TRUE", "true", "1", "1.0"}
    _FALSE_VALUES = {"False", "FALSE", "false", "0", "0.0"}

    _HANDLED_TYPES = (np.ndarray, numbers.Number, bool, np.bool_)

    @classmethod
    def _simple_new(cls, values: np.ndarray, mask: npt.NDArray[np.bool_]) -> Self:
        """Build an instance through the parent's ``_simple_new`` and set its dtype."""
        instance = super()._simple_new(values, mask)
        instance._dtype = BooleanDtype()
        return instance

    def __init__(
        self, values: np.ndarray, mask: np.ndarray, copy: bool = False
    ) -> None:
        is_bool_ndarray = isinstance(values, np.ndarray) and values.dtype == np.bool_
        if not is_bool_ndarray:
            raise TypeError(
                "values should be boolean numpy array. Use "
                "the 'pd.array' function instead"
            )
        self._dtype = BooleanDtype()
        super().__init__(values, mask, copy=copy)

    @property
    def dtype(self) -> BooleanDtype:
        return self._dtype

    @classmethod
    def _from_sequence_of_strings(
        cls,
        strings: list[str],
        *,
        dtype: Dtype | None = None,
        copy: bool = False,
        true_values: list[str] | None = None,
        false_values: list[str] | None = None,
    ) -> BooleanArray:
        """
        Parse a sequence of strings into a boolean array.

        Missing entries stay missing. Any other entry must be one of the
        recognised true/false spellings (class defaults plus the optional
        ``true_values`` / ``false_values``), otherwise ValueError is raised.
        """
        accepted_true = cls._TRUE_VALUES.union(true_values or [])
        accepted_false = cls._FALSE_VALUES.union(false_values or [])

        def parse(text) -> bool:
            if text in accepted_true:
                return True
            if text in accepted_false:
                return False
            raise ValueError(f"{text} cannot be cast to bool")

        scalars = np.array(strings, dtype=object)
        present = ~isna(scalars)
        # Only the non-missing entries are parsed; missing ones are left as-is.
        scalars[present] = [parse(text) for text in scalars[present]]
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        """Delegate to the module-level ``coerce_to_array``."""
        if dtype:
            assert dtype == "boolean"
        return coerce_to_array(value, copy=copy)

    def _logical_method(self, other, op):
        """
        Apply ``|``, ``&`` or ``^`` (or a reflected variant) against ``other``.

        ``other`` may be a BooleanArray, a 1-d list-like, a bool or
        ``pandas.NA``. The operator is selected by ``op.__name__`` and
        evaluated by the matching ``ops.kleene_*`` function.

        Raises
        ------
        TypeError
            If ``other`` is a scalar that is neither a bool nor ``pandas.NA``.
        NotImplementedError
            If a list-like ``other`` has more than one dimension.
        ValueError
            If a non-scalar ``other`` differs in length from ``self``.
        """
        op_name = op.__name__
        assert op_name in {"or_", "ror_", "and_", "rand_", "xor", "rxor"}
        other_is_scalar = lib.is_scalar(other)
        other_mask = None

        if isinstance(other, BooleanArray):
            other, other_mask = other._data, other._mask
        elif is_list_like(other):
            other = np.asarray(other, dtype="bool")
            if other.ndim > 1:
                raise NotImplementedError("can only perform ops with 1-d structures")
            other, other_mask = coerce_to_array(other, copy=False)
        elif isinstance(other, np.bool_):
            # Unwrap numpy scalars to a plain Python bool.
            other = other.item()

        if other_is_scalar and other is not libmissing.NA and not lib.is_bool(other):
            raise TypeError(
                "'other' should be pandas.NA or a bool. "
                f"Got {type(other).__name__} instead."
            )

        if not other_is_scalar and len(self) != len(other):
            raise ValueError("Lengths must match")

        # Anything that is not an or/and variant is xor / rxor.
        kleene_by_name = {
            "or_": ops.kleene_or,
            "ror_": ops.kleene_or,
            "and_": ops.kleene_and,
            "rand_": ops.kleene_and,
        }
        kleene_op = kleene_by_name.get(op_name, ops.kleene_xor)
        result, result_mask = kleene_op(self._data, other, self._mask, other_mask)

        # i.e. BooleanArray
        return self._maybe_mask_result(result, result_mask)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> BaseMaskedArray:
        """
        Run the cumulative operation ``name`` over the array.

        ``cummin`` / ``cummax`` are computed on the boolean data directly;
        any other name is forwarded to an IntegerArray built from the data
        cast to ``int``.
        """
        if name not in ("cummin", "cummax"):
            from pandas.core.arrays import IntegerArray

            as_integers = IntegerArray(self._data.astype(int), self._mask)
            return as_integers._accumulate(name, skipna=skipna, **kwargs)

        accumulate = getattr(masked_accumulations, name)
        data, mask = accumulate(self._data, self._mask, skipna=skipna, **kwargs)
        return self._simple_new(data, mask)