from __future__ import annotations

import functools
import operator
import re
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import (
    NaT,
    Timedelta,
    Timestamp,
    timezones,
)
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
    pa_version_under13p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import (
    can_hold_element,
    infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
    CategoricalDtype,
    is_array_like,
    is_bool_dtype,
    is_float_dtype,
    is_integer,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
    algorithms as algos,
    missing,
    ops,
    roperator,
)
from pandas.core.algorithms import map_array
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays._utils import to_numpy_dtype_inference
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.string_ import StringDtype
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.io._util import _arrow_dtype_mapping
from pandas.tseries.frequencies import to_offset

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.dtypes.dtypes import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    ARROW_BIT_WISE_FUNCS = {
        "and_": pc.bit_wise_and,
        "rand_": lambda x, y: pc.bit_wise_and(y, x),
        "or_": pc.bit_wise_or,
        "ror_": lambda x, y: pc.bit_wise_or(y, x),
        "xor": pc.bit_wise_xor,
        "rxor": lambda x, y: pc.bit_wise_xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            # GH: 56645.
            # https://github.com/apache/arrow/issues/35563
            return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
                pa_object, pa.float64(), safe=False
            )

        return arrow_array, pa_object

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # TODO: Replace with pyarrow floordiv kernel.
        # https://github.com/apache/arrow/issues/39386
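        # Why the fixup below: divide_checked truncates toward zero, while
        # Python floors, e.g. -7 // 2 == -4 but trunc(-7 / 2) == -3; with a
        # remainder and exactly one negative operand we subtract 1 to floor.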
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            divided = pc.divide_checked(left, right)
            if pa.types.is_signed_integer(divided.type):
                # GH 56676
                has_remainder = pc.not_equal(pc.multiply(divided, right), left)
                has_one_negative_operand = pc.less(
                    pc.bit_wise_xor(left, right),
                    pa.scalar(0, type=divided.type),
                )
                result = pc.if_else(
                    pc.and_(
                        has_remainder,
                        has_one_negative_operand,
                    ),
                    # GH: 55561
                    pc.subtract(divided, pa.scalar(1, type=divided.type)),
                    divided,
                )
            else:
                result = divided
            result = result.cast(left.type)
        else:
            divided = pc.divide(left, right)
            result = pc.floor(divided)
        return result

    ARROW_ARITHMETIC_FUNCS = {
        "add": pc.add_checked,
        "radd": lambda x, y: pc.add_checked(y, x),
        "sub": pc.subtract_checked,
        "rsub": lambda x, y: pc.subtract_checked(y, x),
        "mul": pc.multiply_checked,
        "rmul": lambda x, y: pc.multiply_checked(y, x),
        "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
        "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
        "floordiv": lambda x, y: floordiv_compat(x, y),
        "rfloordiv": lambda x, y: floordiv_compat(y, x),
        "mod": NotImplemented,
        "rmod": NotImplemented,
        "divmod": NotImplemented,
        "rdivmod": NotImplemented,
        "pow": pc.power_checked,
        "rpow": lambda x, y: pc.power_checked(y, x),
    }

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        AxisInt,
        Dtype,
        FillnaOptions,
        InterpolateOptions,
        Iterator,
        NpDtype,
        NumpySorter,
        NumpyValueArrayLike,
        PositionalIndexer,
        Scalar,
        Self,
        SortKind,
        TakeIndexer,
        TimeAmbiguous,
        TimeNonexistent,
        npt,
    )

    from pandas import Series
    from pandas.core.arrays.datetimes import DatetimeArray
    from pandas.core.arrays.timedeltas import TimedeltaArray


def get_unit_from_pa_dtype(pa_dtype):
    # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
    if pa_version_under11p0:
        unit = str(pa_dtype).split("[", 1)[-1][:-1]
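        # e.g. str(pa.timestamp("us")) == "timestamp[us]" -> "us"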
        if unit not in ["s", "ms", "us", "ns"]:
            raise ValueError(pa_dtype)
        return unit
    return pa_dtype.unit


def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.
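
    Examples
    --------
    >>> import pyarrow as pa
    >>> to_pyarrow_type(pd.ArrowDtype(pa.int64()))
    DataType(int64)
    >>> to_pyarrow_type(np.dtype("float64"))
    DataType(double)
    >>> to_pyarrow_type(None) is None
    True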
220 """
221 if isinstance(dtype, ArrowDtype):
222 return dtype.pyarrow_dtype
223 elif isinstance(dtype, pa.DataType):
224 return dtype
225 elif isinstance(dtype, DatetimeTZDtype):
226 return pa.timestamp(dtype.unit, dtype.tz)
227 elif dtype:
228 try:
229 # Accepts python types too
230 # Doesn't handle all numpy types
231 return pa.from_numpy_dtype(dtype)
232 except pa.ArrowNotImplementedError:
233 pass
234 return None
235
236
237class ArrowExtensionArray(
238 OpsMixin,
239 ExtensionArraySupportsAnyAll,
240 ArrowStringArrayMixin,
241 BaseStringArrayMethods,
242):
243 """
244 Pandas ExtensionArray backed by a PyArrow ChunkedArray.
245
246 .. warning::
247
248 ArrowExtensionArray is considered experimental. The implementation and
249 parts of the API may change without warning.
250
251 Parameters
252 ----------
253 values : pyarrow.Array or pyarrow.ChunkedArray
254
255 Attributes
256 ----------
257 None
258
259 Methods
260 -------
261 None
262
263 Returns
264 -------
265 ArrowExtensionArray
266
267 Notes
268 -----
269 Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
270 Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
271 associated compute function is not available based on the installed version of PyArrow.
272
273 Please install the latest version of PyArrow to enable the best functionality and avoid
274 potential bugs in prior versions of PyArrow.
275
276 Examples
277 --------
278 Create an ArrowExtensionArray with :func:`pandas.array`:
279
280 >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
281 <ArrowExtensionArray>
282 [1, 1, <NA>]
283 Length: 3, dtype: int64[pyarrow]
284 """ # noqa: E501 (http link too long)
285
286 _pa_array: pa.ChunkedArray
287 _dtype: ArrowDtype
288
289 def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
290 if pa_version_under10p1:
291 msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
292 raise ImportError(msg)
293 if isinstance(values, pa.Array):
294 self._pa_array = pa.chunked_array([values])
295 elif isinstance(values, pa.ChunkedArray):
296 self._pa_array = values
297 else:
298 raise ValueError(
299 f"Unsupported type '{type(values)}' for ArrowExtensionArray"
300 )
301 self._dtype = ArrowDtype(self._pa_array.type)
302
303 @classmethod
304 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
305 """
306 Construct a new ExtensionArray from a sequence of scalars.
307 """
308 pa_type = to_pyarrow_type(dtype)
309 pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy)
310 arr = cls(pa_array)
311 return arr
312
313 @classmethod
314 def _from_sequence_of_strings(
315 cls, strings, *, dtype: Dtype | None = None, copy: bool = False
316 ):
317 """
318 Construct a new ExtensionArray from a sequence of strings.
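
        Examples
        --------
        >>> import pyarrow as pa
        >>> pd.arrays.ArrowExtensionArray._from_sequence_of_strings(
        ...     ["1", "2", None], dtype=pd.ArrowDtype(pa.int64())
        ... )
        <ArrowExtensionArray>
        [1, 2, <NA>]
        Length: 3, dtype: int64[pyarrow]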
319 """
320 pa_type = to_pyarrow_type(dtype)
321 if (
322 pa_type is None
323 or pa.types.is_binary(pa_type)
324 or pa.types.is_string(pa_type)
325 or pa.types.is_large_string(pa_type)
326 ):
327 # pa_type is None: Let pa.array infer
328 # pa_type is string/binary: scalars already correct type
329 scalars = strings
330 elif pa.types.is_timestamp(pa_type):
331 from pandas.core.tools.datetimes import to_datetime
332
333 scalars = to_datetime(strings, errors="raise")
334 elif pa.types.is_date(pa_type):
335 from pandas.core.tools.datetimes import to_datetime
336
337 scalars = to_datetime(strings, errors="raise").date
338 elif pa.types.is_duration(pa_type):
339 from pandas.core.tools.timedeltas import to_timedelta
340
341 scalars = to_timedelta(strings, errors="raise")
342 if pa_type.unit != "ns":
343 # GH51175: test_from_sequence_of_strings_pa_array
344 # attempt to parse as int64 reflecting pyarrow's
345 # duration to string casting behavior
346 mask = isna(scalars)
347 if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
348 strings = pa.array(strings, type=pa.string(), from_pandas=True)
349 strings = pc.if_else(mask, None, strings)
350 try:
351 scalars = strings.cast(pa.int64())
352 except pa.ArrowInvalid:
353 pass
354 elif pa.types.is_time(pa_type):
355 from pandas.core.tools.times import to_time
356
357 # "coerce" to allow "null times" (None) to not raise
358 scalars = to_time(strings, errors="coerce")
359 elif pa.types.is_boolean(pa_type):
360 # pyarrow string->bool casting is case-insensitive:
361 # "true" or "1" -> True
362 # "false" or "0" -> False
363 # Note: BooleanArray was previously used to parse these strings
364 # and allows "1.0" and "0.0". Pyarrow casting does not support
365 # this, but we allow it here.
366 if isinstance(strings, (pa.Array, pa.ChunkedArray)):
367 scalars = strings
368 else:
369 scalars = pa.array(strings, type=pa.string(), from_pandas=True)
370 scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
371 scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
372 scalars = scalars.cast(pa.bool_())
373 elif (
374 pa.types.is_integer(pa_type)
375 or pa.types.is_floating(pa_type)
376 or pa.types.is_decimal(pa_type)
377 ):
378 from pandas.core.tools.numeric import to_numeric
379
380 scalars = to_numeric(strings, errors="raise")
381 else:
382 raise NotImplementedError(
383 f"Converting strings to {pa_type} is not implemented."
384 )
385 return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
386
387 @classmethod
388 def _box_pa(
389 cls, value, pa_type: pa.DataType | None = None
390 ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
391 """
392 Box value into a pyarrow Array, ChunkedArray or Scalar.
393
394 Parameters
395 ----------
396 value : any
397 pa_type : pa.DataType | None
398
399 Returns
400 -------
401 pa.Array or pa.ChunkedArray or pa.Scalar
402 """
403 if isinstance(value, pa.Scalar) or not is_list_like(value):
404 return cls._box_pa_scalar(value, pa_type)
405 return cls._box_pa_array(value, pa_type)
406
407 @classmethod
408 def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
409 """
410 Box value into a pyarrow Scalar.
411
412 Parameters
413 ----------
414 value : any
415 pa_type : pa.DataType | None
416
417 Returns
418 -------
419 pa.Scalar
420 """
421 if isinstance(value, pa.Scalar):
422 pa_scalar = value
423 elif isna(value):
424 pa_scalar = pa.scalar(None, type=pa_type)
425 else:
426 # Workaround https://github.com/apache/arrow/issues/37291
427 if isinstance(value, Timedelta):
428 if pa_type is None:
429 pa_type = pa.duration(value.unit)
430 elif value.unit != pa_type.unit:
431 value = value.as_unit(pa_type.unit)
432 value = value._value
433 elif isinstance(value, Timestamp):
434 if pa_type is None:
435 pa_type = pa.timestamp(value.unit, tz=value.tz)
436 elif value.unit != pa_type.unit:
437 value = value.as_unit(pa_type.unit)
438 value = value._value
439
440 pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
441
442 if pa_type is not None and pa_scalar.type != pa_type:
443 pa_scalar = pa_scalar.cast(pa_type)
444
445 return pa_scalar
446
447 @classmethod
448 def _box_pa_array(
449 cls, value, pa_type: pa.DataType | None = None, copy: bool = False
450 ) -> pa.Array | pa.ChunkedArray:
451 """
452 Box value into a pyarrow Array or ChunkedArray.
453
454 Parameters
455 ----------
456 value : Sequence
457 pa_type : pa.DataType | None
458
459 Returns
460 -------
461 pa.Array or pa.ChunkedArray
462 """
463 if isinstance(value, cls):
464 pa_array = value._pa_array
465 elif isinstance(value, (pa.Array, pa.ChunkedArray)):
466 pa_array = value
467 elif isinstance(value, BaseMaskedArray):
468 # GH 52625
469 if copy:
470 value = value.copy()
471 pa_array = value.__arrow_array__()
472 else:
473 if (
474 isinstance(value, np.ndarray)
475 and pa_type is not None
476 and (
477 pa.types.is_large_binary(pa_type)
478 or pa.types.is_large_string(pa_type)
479 )
480 ):
481 # See https://github.com/apache/arrow/issues/35289
482 value = value.tolist()
483 elif copy and is_array_like(value):
484 # pa array should not get updated when numpy array is updated
485 value = value.copy()
486
487 if (
488 pa_type is not None
489 and pa.types.is_duration(pa_type)
490 and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi")
491 ):
492 # Workaround https://github.com/apache/arrow/issues/37291
493 from pandas.core.tools.timedeltas import to_timedelta
494
495 value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
496 value = value.to_numpy()
497
498 try:
499 pa_array = pa.array(value, type=pa_type, from_pandas=True)
500 except (pa.ArrowInvalid, pa.ArrowTypeError):
501 # GH50430: let pyarrow infer type, then cast
502 pa_array = pa.array(value, from_pandas=True)
503
504 if pa_type is None and pa.types.is_duration(pa_array.type):
505 # Workaround https://github.com/apache/arrow/issues/37291
506 from pandas.core.tools.timedeltas import to_timedelta
507
508 value = to_timedelta(value)
509 value = value.to_numpy()
510 pa_array = pa.array(value, type=pa_type, from_pandas=True)
511
512 if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
513 # GH52843: upstream bug for duration types when originally
514 # constructed with data containing numpy NaT.
515 # https://github.com/apache/arrow/issues/35088
516 arr = cls(pa_array)
517 arr = arr.fillna(arr.dtype.na_value)
518 pa_array = arr._pa_array
519
520 if pa_type is not None and pa_array.type != pa_type:
521 if pa.types.is_dictionary(pa_type):
522 pa_array = pa_array.dictionary_encode()
523 else:
524 try:
525 pa_array = pa_array.cast(pa_type)
526 except (
527 pa.ArrowInvalid,
528 pa.ArrowTypeError,
529 pa.ArrowNotImplementedError,
530 ):
531 if pa.types.is_string(pa_array.type) or pa.types.is_large_string(
532 pa_array.type
533 ):
534 # TODO: Move logic in _from_sequence_of_strings into
535 # _box_pa_array
536 return cls._from_sequence_of_strings(
537 value, dtype=pa_type
538 )._pa_array
539 else:
540 raise
541
542 return pa_array
543
544 def __getitem__(self, item: PositionalIndexer):
545 """Select a subset of self.
546
547 Parameters
548 ----------
549 item : int, slice, or ndarray
550 * int: The position in 'self' to get.
551 * slice: A slice object, where 'start', 'stop', and 'step' are
552 integers or None
553 * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
554
555 Returns
556 -------
557 item : scalar or ExtensionArray
558
559 Notes
560 -----
561 For scalar ``item``, return a scalar value suitable for the array's
562 type. This should be an instance of ``self.dtype.type``.
563 For slice ``key``, return an instance of ``ExtensionArray``, even
564 if the slice is length 0 or 1.
565 For a boolean mask, return an instance of ``ExtensionArray``, filtered
566 to the values where ``item`` is True.
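
        Examples
        --------
        >>> arr = pd.array([1, 2, 3], dtype="int64[pyarrow]")
        >>> arr[0]
        1
        >>> arr[1:]
        <ArrowExtensionArray>
        [2, 3]
        Length: 2, dtype: int64[pyarrow]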
567 """
568 item = check_array_indexer(self, item)
569
570 if isinstance(item, np.ndarray):
571 if not len(item):
572 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
573 if self._dtype.name == "string" and self._dtype.storage in (
574 "pyarrow",
575 "pyarrow_numpy",
576 ):
577 pa_dtype = pa.string()
578 else:
579 pa_dtype = self._dtype.pyarrow_dtype
580 return type(self)(pa.chunked_array([], type=pa_dtype))
581 elif item.dtype.kind in "iu":
582 return self.take(item)
583 elif item.dtype.kind == "b":
584 return type(self)(self._pa_array.filter(item))
585 else:
586 raise IndexError(
587 "Only integers, slices and integer or "
588 "boolean arrays are valid indices."
589 )
590 elif isinstance(item, tuple):
591 item = unpack_tuple_and_ellipses(item)
592
593 if item is Ellipsis:
594 # TODO: should be handled by pyarrow?
595 item = slice(None)
596
597 if is_scalar(item) and not is_integer(item):
598 # e.g. "foo" or 2.5
599 # exception message copied from numpy
600 raise IndexError(
601 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
602 r"(`None`) and integer or boolean arrays are valid indices"
603 )
604 # We are not an array indexer, so maybe e.g. a slice or integer
605 # indexer. We dispatch to pyarrow.
606 if isinstance(item, slice):
607 # Arrow bug https://github.com/apache/arrow/issues/38768
608 if item.start == item.stop:
609 pass
610 elif (
611 item.stop is not None
612 and item.stop < -len(self)
613 and item.step is not None
614 and item.step < 0
615 ):
616 item = slice(item.start, None, item.step)
617
618 value = self._pa_array[item]
619 if isinstance(value, pa.ChunkedArray):
620 return type(self)(value)
621 else:
622 pa_type = self._pa_array.type
623 scalar = value.as_py()
624 if scalar is None:
625 return self._dtype.na_value
626 elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns":
627 # GH 53326
628 return Timestamp(scalar).as_unit(pa_type.unit)
629 elif pa.types.is_duration(pa_type) and pa_type.unit != "ns":
630 # GH 53326
631 return Timedelta(scalar).as_unit(pa_type.unit)
632 else:
633 return scalar
634
635 def __iter__(self) -> Iterator[Any]:
636 """
637 Iterate over elements of the array.
638 """
639 na_value = self._dtype.na_value
640 # GH 53326
641 pa_type = self._pa_array.type
642 box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns"
643 box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns"
644 for value in self._pa_array:
645 val = value.as_py()
646 if val is None:
647 yield na_value
648 elif box_timestamp:
649 yield Timestamp(val).as_unit(pa_type.unit)
650 elif box_timedelta:
651 yield Timedelta(val).as_unit(pa_type.unit)
652 else:
653 yield val
654
655 def __arrow_array__(self, type=None):
656 """Convert myself to a pyarrow ChunkedArray."""
657 return self._pa_array
658
659 def __array__(
660 self, dtype: NpDtype | None = None, copy: bool | None = None
661 ) -> np.ndarray:
662 """Correctly construct numpy arrays when passed to `np.asarray()`."""
663 return self.to_numpy(dtype=dtype)
664
665 def __invert__(self) -> Self:
666 # This is a bit wise op for integer types
667 if pa.types.is_integer(self._pa_array.type):
668 return type(self)(pc.bit_wise_not(self._pa_array))
669 elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
670 self._pa_array.type
671 ):
672 # Raise TypeError instead of pa.ArrowNotImplementedError
673 raise TypeError("__invert__ is not supported for string dtypes")
674 else:
675 return type(self)(pc.invert(self._pa_array))
676
677 def __neg__(self) -> Self:
678 return type(self)(pc.negate_checked(self._pa_array))
679
680 def __pos__(self) -> Self:
681 return type(self)(self._pa_array)
682
683 def __abs__(self) -> Self:
684 return type(self)(pc.abs_checked(self._pa_array))
685
686 # GH 42600: __getstate__/__setstate__ not necessary once
687 # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
688 def __getstate__(self):
689 state = self.__dict__.copy()
690 state["_pa_array"] = self._pa_array.combine_chunks()
691 return state
692
693 def __setstate__(self, state) -> None:
694 if "_data" in state:
695 data = state.pop("_data")
696 else:
697 data = state["_pa_array"]
698 state["_pa_array"] = pa.chunked_array(data)
699 self.__dict__.update(state)
700
701 def _cmp_method(self, other, op):
702 pc_func = ARROW_CMP_FUNCS[op.__name__]
703 if isinstance(
704 other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
705 ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
706 result = pc_func(self._pa_array, self._box_pa(other))
707 elif is_scalar(other):
708 try:
709 result = pc_func(self._pa_array, self._box_pa(other))
710 except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
711 mask = isna(self) | isna(other)
712 valid = ~mask
713 result = np.zeros(len(self), dtype="bool")
714 np_array = np.array(self)
715 try:
716 result[valid] = op(np_array[valid], other)
717 except TypeError:
718 result = ops.invalid_comparison(np_array, other, op)
719 result = pa.array(result, type=pa.bool_())
720 result = pc.if_else(valid, result, None)
721 else:
722 raise NotImplementedError(
723 f"{op.__name__} not implemented for {type(other)}"
724 )
725 return ArrowExtensionArray(result)
726
727 def _evaluate_op_method(self, other, op, arrow_funcs):
728 pa_type = self._pa_array.type
729 other = self._box_pa(other)
730
731 if (
732 pa.types.is_string(pa_type)
733 or pa.types.is_large_string(pa_type)
734 or pa.types.is_binary(pa_type)
735 ):
736 if op in [operator.add, roperator.radd]:
737 sep = pa.scalar("", type=pa_type)
738 if op is operator.add:
739 result = pc.binary_join_element_wise(self._pa_array, other, sep)
740 elif op is roperator.radd:
741 result = pc.binary_join_element_wise(other, self._pa_array, sep)
742 return type(self)(result)
743 elif op in [operator.mul, roperator.rmul]:
744 binary = self._pa_array
745 integral = other
746 if not pa.types.is_integer(integral.type):
747 raise TypeError("Can only string multiply by an integer.")
748 pa_integral = pc.if_else(pc.less(integral, 0), 0, integral)
749 result = pc.binary_repeat(binary, pa_integral)
750 return type(self)(result)
751 elif (
752 pa.types.is_string(other.type)
753 or pa.types.is_binary(other.type)
754 or pa.types.is_large_string(other.type)
755 ) and op in [operator.mul, roperator.rmul]:
756 binary = other
757 integral = self._pa_array
758 if not pa.types.is_integer(integral.type):
759 raise TypeError("Can only string multiply by an integer.")
760 pa_integral = pc.if_else(pc.less(integral, 0), 0, integral)
761 result = pc.binary_repeat(binary, pa_integral)
762 return type(self)(result)
763 if (
764 isinstance(other, pa.Scalar)
765 and pc.is_null(other).as_py()
766 and op.__name__ in ARROW_LOGICAL_FUNCS
767 ):
768 # pyarrow kleene ops require null to be typed
769 other = other.cast(pa_type)
770
771 pc_func = arrow_funcs[op.__name__]
772 if pc_func is NotImplemented:
773 raise NotImplementedError(f"{op.__name__} not implemented.")
774
775 result = pc_func(self._pa_array, other)
776 return type(self)(result)
777
778 def _logical_method(self, other, op):
779 # For integer types `^`, `|`, `&` are bitwise operators and return
780 # integer types. Otherwise these are boolean ops.
781 if pa.types.is_integer(self._pa_array.type):
782 return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS)
783 else:
784 return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)
785
786 def _arith_method(self, other, op):
787 return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
788
789 def equals(self, other) -> bool:
790 if not isinstance(other, ArrowExtensionArray):
791 return False
792 # I'm told that pyarrow makes __eq__ behave like pandas' equals;
793 # TODO: is this documented somewhere?
794 return self._pa_array == other._pa_array
795
796 @property
797 def dtype(self) -> ArrowDtype:
798 """
799 An instance of 'ExtensionDtype'.
800 """
801 return self._dtype
802
803 @property
804 def nbytes(self) -> int:
805 """
806 The number of bytes needed to store this object in memory.
807 """
808 return self._pa_array.nbytes
809
810 def __len__(self) -> int:
811 """
812 Length of this array.
813
814 Returns
815 -------
816 length : int
817 """
818 return len(self._pa_array)
819
820 def __contains__(self, key) -> bool:
821 # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
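        # Illustrative: float("nan") is "in" a float[pyarrow] array holding NaN,
        # while other NA-likes (None, NaT) are not; pd.NA itself is handled by
        # the superclass check below.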
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key):
                return pc.any(pc.is_nan(self._pa_array)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))

    @property
    def _hasna(self) -> bool:
        return self._pa_array.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        # GH51630: fast paths
        null_count = self._pa_array.null_count
        if null_count == 0:
            return np.zeros(len(self), dtype=np.bool_)
        elif null_count == len(self):
            return np.ones(len(self), dtype=np.bool_)

        return self._pa_array.is_null().to_numpy()

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)

    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._pa_array, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._pa_array.length() in (0, self._pa_array.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._pa_array
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self) -> Self:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._pa_array)

    def dropna(self) -> Self:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._pa_array))

    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        if not self._hasna:
            # TODO(CoW): Not necessary anymore when CoW is the default
            return self.copy()

        if limit is None and limit_area is None:
            method = missing.clean_fill_method(method)
            try:
                if method == "pad":
                    return type(self)(pc.fill_null_forward(self._pa_array))
                elif method == "backfill":
                    return type(self)(pc.fill_null_backward(self._pa_array))
            except pa.ArrowNotImplementedError:
                # ArrowNotImplementedError: Function 'coalesce' has no kernel
                #   matching input types (duration[ns], duration[ns])
                # TODO: remove try/except wrapper if/when pyarrow implements
                #   a kernel for duration types.
                pass

        # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove
        #  this method entirely.
        return super()._pad_or_backfill(
            method=method, limit=limit, limit_area=limit_area, copy=copy
        )

    @doc(ExtensionArray.fillna)
    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        copy: bool = True,
    ) -> Self:
        value, method = validate_fillna_kwargs(value, method)

        if not self._hasna:
            # TODO(CoW): Not necessary anymore when CoW is the default
            return self.copy()

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit, copy=copy)

        if method is not None:
            return super().fillna(method=method, limit=limit, copy=copy)

        if isinstance(value, (np.ndarray, ExtensionArray)):
            # Similar to check_value_size, but we do not mask here since we may
            # end up passing it to the super() method.
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f"expected {len(self)}"
                )

        try:
            fill_value = self._box_pa(value, pa_type=self._pa_array.type)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err

        try:
            return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            #   matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            #   a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit, copy=copy)

    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._pa_array.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        data = self._pa_array
        pa_type = data.type
        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = data.cast(pa.int64())

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            # GH 54844
            combined = encoded.combine_chunks()
            pa_indices = combined.indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(combined.dictionary)

        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(self, decimals: int = 0, *args, **kwargs) -> Self:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
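
        Examples
        --------
        >>> pd.array([1.234, 2.5, None], dtype="float64[pyarrow]").round(1)
        <ArrowExtensionArray>
        [1.2, 2.5, <NA>]
        Length: 3, dtype: float64[pyarrow]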
1197 """
1198 return type(self)(pc.round(self._pa_array, ndigits=decimals))
1199
1200 @doc(ExtensionArray.searchsorted)
1201 def searchsorted(
1202 self,
1203 value: NumpyValueArrayLike | ExtensionArray,
1204 side: Literal["left", "right"] = "left",
1205 sorter: NumpySorter | None = None,
1206 ) -> npt.NDArray[np.intp] | np.intp:
1207 if self._hasna:
1208 raise ValueError(
1209 "searchsorted requires array to be sorted, which is impossible "
1210 "with NAs present."
1211 )
1212 if isinstance(value, ExtensionArray):
1213 value = value.astype(object)
1214 # Base class searchsorted would cast to object, which is *much* slower.
1215 dtype = None
1216 if isinstance(self.dtype, ArrowDtype):
1217 pa_dtype = self.dtype.pyarrow_dtype
1218 if (
1219 pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype)
1220 ) and pa_dtype.unit == "ns":
1221 # np.array[datetime/timedelta].searchsorted(datetime/timedelta)
1222 # erroneously fails when numpy type resolution is nanoseconds
1223 dtype = object
1224 return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter)
1225
1226 def take(
1227 self,
1228 indices: TakeIndexer,
1229 allow_fill: bool = False,
1230 fill_value: Any = None,
1231 ) -> ArrowExtensionArray:
1232 """
1233 Take elements from an array.
1234
1235 Parameters
1236 ----------
1237 indices : sequence of int or one-dimensional np.ndarray of int
1238 Indices to be taken.
1239 allow_fill : bool, default False
1240 How to handle negative values in `indices`.
1241
1242 * False: negative values in `indices` indicate positional indices
1243 from the right (the default). This is similar to
1244 :func:`numpy.take`.
1245
1246 * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        indices_array = np.asanyarray(indices)

        if len(self._pa_array) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._pa_array):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._pa_array))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._pa_array.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._pa_array.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._pa_array)
            return type(self)(self._pa_array.take(indices_array))

    def _maybe_convert_datelike_array(self):
        """Maybe convert to a datelike array."""
        pa_type = self._pa_array.type
        if pa.types.is_timestamp(pa_type):
            return self._to_datetimearray()
        elif pa.types.is_duration(pa_type):
            return self._to_timedeltaarray()
        return self

    def _to_datetimearray(self) -> DatetimeArray:
        """Convert a pyarrow timestamp typed array to a DatetimeArray."""
        from pandas.core.arrays.datetimes import (
            DatetimeArray,
            tz_to_dtype,
        )

        pa_type = self._pa_array.type
        assert pa.types.is_timestamp(pa_type)
        np_dtype = np.dtype(f"M8[{pa_type.unit}]")
        dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
        np_array = self._pa_array.to_numpy()
        np_array = np_array.astype(np_dtype)
        return DatetimeArray._simple_new(np_array, dtype=dtype)

    def _to_timedeltaarray(self) -> TimedeltaArray:
        """Convert a pyarrow duration typed array to a TimedeltaArray."""
        from pandas.core.arrays.timedeltas import TimedeltaArray

        pa_type = self._pa_array.type
        assert pa.types.is_duration(pa_type)
        np_dtype = np.dtype(f"m8[{pa_type.unit}]")
        np_array = self._pa_array.to_numpy()
        np_array = np_array.astype(np_dtype)
        return TimedeltaArray._simple_new(np_array, dtype=np_dtype)

    def _values_for_json(self) -> np.ndarray:
        if is_numeric_dtype(self.dtype):
            return np.asarray(self, dtype=object)
        return super()._values_for_json()

    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        original_na_value = na_value
        dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
        pa_type = self._pa_array.type
        if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
            data = self
        else:
            data = self.fillna(na_value)
            copy = False

        if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
            # GH 55997
            if dtype != object and na_value is self.dtype.na_value:
                na_value = lib.no_default
            result = data._maybe_convert_datelike_array().to_numpy(
                dtype=dtype, na_value=na_value
            )
        elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type):
            # convert to list of python datetime.time objects before
            # wrapping in ndarray
            result = np.array(list(data), dtype=dtype)
            if data._hasna:
                result[data.isna()] = na_value
        elif pa.types.is_null(pa_type):
            if dtype is not None and isna(na_value):
                na_value = None
            result = np.full(len(data), fill_value=na_value, dtype=dtype)
        elif not data._hasna or (
            pa.types.is_floating(pa_type)
            and (
                na_value is np.nan
                or original_na_value is lib.no_default
                and is_float_dtype(dtype)
            )
        ):
            result = data._pa_array.to_numpy()
            if dtype is not None:
                result = result.astype(dtype, copy=False)
            if copy:
                result = result.copy()
        else:
            if dtype is None:
                empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False)
                if can_hold_element(empty, na_value):
                    dtype = empty.dtype
                else:
                    dtype = np.object_
            result = np.empty(len(data), dtype=dtype)
            mask = data.isna()
            result[mask] = na_value
            result[~mask] = data[~mask]._pa_array.to_numpy()
        return result

    def map(self, mapper, na_action=None):
        if is_numeric_dtype(self.dtype):
            return map_array(self.to_numpy(), mapper, na_action=na_action)
        else:
            return super().map(mapper, na_action)

    @doc(ExtensionArray.duplicated)
    def duplicated(
        self, keep: Literal["first", "last", False] = "first"
    ) -> npt.NDArray[np.bool_]:
        pa_type = self._pa_array.type
        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
            values = self.to_numpy(na_value=0)
        elif pa.types.is_boolean(pa_type):
            values = self.to_numpy(na_value=False)
        elif pa.types.is_temporal(pa_type):
            if pa_type.bit_width == 32:
                pa_type = pa.int32()
            else:
                pa_type = pa.int64()
            arr = self.astype(ArrowDtype(pa_type))
            values = arr.to_numpy(na_value=0)
        else:
            # factorize the values to avoid the performance penalty of
            # converting to object dtype
            values = self.factorize()[0]

        mask = self.isna() if self._hasna else None
        return algos.duplicated(values, keep=keep, mask=mask)

    def unique(self) -> Self:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
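
        Examples
        --------
        >>> pd.array([1, 1, 2, None], dtype="int64[pyarrow]").unique()
        <ArrowExtensionArray>
        [1, 2, <NA>]
        Length: 3, dtype: int64[pyarrow]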
1454 """
1455 pa_type = self._pa_array.type
1456
1457 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1458 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1459 data = self._pa_array.cast(pa.int64())
1460 else:
1461 data = self._pa_array
1462
1463 pa_result = pc.unique(data)
1464
1465 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1466 pa_result = pa_result.cast(pa_type)
1467
1468 return type(self)(pa_result)
1469
1470 def value_counts(self, dropna: bool = True) -> Series:
1471 """
1472 Return a Series containing counts of each unique value.
1473
1474 Parameters
1475 ----------
1476 dropna : bool, default True
1477 Don't include counts of missing values.
1478
1479 Returns
1480 -------
1481 counts : Series
1482
1483 See Also
1484 --------
1485 Series.value_counts
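
        Examples
        --------
        >>> pd.array([1, 1, 2], dtype="int64[pyarrow]").value_counts()
        1    2
        2    1
        Name: count, dtype: int64[pyarrow]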
1486 """
1487 pa_type = self._pa_array.type
1488 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1489 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1490 data = self._pa_array.cast(pa.int64())
1491 else:
1492 data = self._pa_array
1493
1494 from pandas import (
1495 Index,
1496 Series,
1497 )
1498
1499 vc = data.value_counts()
1500
1501 values = vc.field(0)
1502 counts = vc.field(1)
1503 if dropna and data.null_count > 0:
1504 mask = values.is_valid()
1505 values = values.filter(mask)
1506 counts = counts.filter(mask)
1507
1508 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1509 values = values.cast(pa_type)
1510
1511 counts = ArrowExtensionArray(counts)
1512
1513 index = Index(type(self)(values))
1514
1515 return Series(counts, index=index, name="count", copy=False)
1516
1517 @classmethod
1518 def _concat_same_type(cls, to_concat) -> Self:
1519 """
1520 Concatenate multiple ArrowExtensionArrays.
1521
1522 Parameters
1523 ----------
1524 to_concat : sequence of ArrowExtensionArrays
1525
1526 Returns
1527 -------
1528 ArrowExtensionArray
1529 """
1530 chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
1531 if to_concat[0].dtype == "string":
1532 # StringDtype has no attribute pyarrow_dtype
1533 pa_dtype = pa.large_string()
1534 else:
1535 pa_dtype = to_concat[0].dtype.pyarrow_dtype
1536 arr = pa.chunked_array(chunks, type=pa_dtype)
1537 return cls(arr)
1538
1539 def _accumulate(
1540 self, name: str, *, skipna: bool = True, **kwargs
1541 ) -> ArrowExtensionArray | ExtensionArray:
1542 """
1543 Return an ExtensionArray performing an accumulation operation.
1544
1545 The underlying data type might change.
1546
1547 Parameters
1548 ----------
1549 name : str
1550 Name of the function, supported values are:
1551 - cummin
1552 - cummax
1553 - cumsum
1554 - cumprod
1555 skipna : bool, default True
1556 If True, skip NA values.
1557 **kwargs
1558 Additional keyword arguments passed to the accumulation function.
1559 Currently, there is no supported kwarg.
1560
1561 Returns
1562 -------
1563 array
1564
1565 Raises
1566 ------
1567 NotImplementedError : subclass does not define accumulations
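
        Examples
        --------
        >>> pd.Series([1, 2, 3], dtype="int64[pyarrow]").cumsum()
        0    1
        1    3
        2    6
        dtype: int64[pyarrow]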
1568 """
1569 pyarrow_name = {
1570 "cummax": "cumulative_max",
1571 "cummin": "cumulative_min",
1572 "cumprod": "cumulative_prod_checked",
1573 "cumsum": "cumulative_sum_checked",
1574 }.get(name, name)
1575 pyarrow_meth = getattr(pc, pyarrow_name, None)
1576 if pyarrow_meth is None:
1577 return super()._accumulate(name, skipna=skipna, **kwargs)
1578
1579 data_to_accum = self._pa_array
1580
1581 pa_dtype = data_to_accum.type
1582
1583 convert_to_int = (
1584 pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"]
1585 ) or (pa.types.is_duration(pa_dtype) and name == "cumsum")
1586
1587 if convert_to_int:
1588 if pa_dtype.bit_width == 32:
1589 data_to_accum = data_to_accum.cast(pa.int32())
1590 else:
1591 data_to_accum = data_to_accum.cast(pa.int64())
1592
1593 result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1594
1595 if convert_to_int:
1596 result = result.cast(pa_dtype)
1597
1598 return type(self)(result)
1599
1600 def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar:
1601 """
1602 Return a pyarrow scalar result of performing the reduction operation.
1603
1604 Parameters
1605 ----------
1606 name : str
1607 Name of the function, supported values are:
1608 { any, all, min, max, sum, mean, median, prod,
1609 std, var, sem, kurt, skew }.
1610 skipna : bool, default True
1611 If True, skip NaN values.
1612 **kwargs
1613 Additional keyword arguments passed to the reduction function.
1614 Currently, `ddof` is the only supported kwarg.
1615
1616 Returns
1617 -------
1618 pyarrow scalar
1619
1620 Raises
1621 ------
1622 TypeError : subclass does not define reductions
1623 """
1624 pa_type = self._pa_array.type
1625
1626 data_to_reduce = self._pa_array
1627
1628 cast_kwargs = {} if pa_version_under13p0 else {"safe": False}
1629
1630 if name in ["any", "all"] and (
1631 pa.types.is_integer(pa_type)
1632 or pa.types.is_floating(pa_type)
1633 or pa.types.is_duration(pa_type)
1634 or pa.types.is_decimal(pa_type)
1635 ):
1636 # pyarrow only supports any/all for boolean dtype, we allow
1637 # for other dtypes, matching our non-pyarrow behavior
1638
1639 if pa.types.is_duration(pa_type):
1640 data_to_cmp = self._pa_array.cast(pa.int64())
1641 else:
1642 data_to_cmp = self._pa_array
1643
1644 not_eq = pc.not_equal(data_to_cmp, 0)
1645 data_to_reduce = not_eq
1646
1647 elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
1648 data_to_reduce = self._pa_array.cast(pa.int64())
1649
1650 elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
1651 nbits = pa_type.bit_width
1652 if nbits == 32:
1653 data_to_reduce = self._pa_array.cast(pa.int32())
1654 else:
1655 data_to_reduce = self._pa_array.cast(pa.int64())
1656
1657 if name == "sem":
1658
1659 def pyarrow_meth(data, skip_nulls, **kwargs):
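                # sem = stddev / sqrt(number of non-null observations)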
                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
                denominator = pc.sqrt_checked(pc.count(self._pa_array))
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "quantile",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)

        # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
        if name in ["any", "all"] and "min_count" not in kwargs:
            kwargs["min_count"] = 0
        elif name == "median":
            # GH 52679: Use quantile instead of approximate_median
            kwargs["q"] = 0.5

        try:
            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if name == "median":
            # GH 52679: Use quantile instead of approximate_median; returns array
            result = result[0]
        if pc.is_null(result).as_py():
            return result

        if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            result = result.cast(pa_type)
        if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
            if not pa_version_under13p0:
                nbits = pa_type.bit_width
                if nbits == 32:
                    result = result.cast(pa.int32(), **cast_kwargs)
                else:
                    result = result.cast(pa.int64(), **cast_kwargs)
            result = result.cast(pa_type)
        if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa.int64(), **cast_kwargs)
            if pa.types.is_duration(pa_type):
                result = result.cast(pa_type)
            elif pa.types.is_time(pa_type):
                unit = get_unit_from_pa_dtype(pa_type)
                result = result.cast(pa.duration(unit))
            elif pa.types.is_date(pa_type):
                # go with closest available unit, i.e. "s"
                result = result.cast(pa.duration("s"))
            else:
                # i.e. timestamp
                result = result.cast(pa.duration(pa_type.unit))

        return result

    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
        if isinstance(result, pa.Array):
            return type(self)(result)
        else:
            return result

    def _reduce_calc(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)

        if keepdims:
            if isinstance(pa_result, pa.Scalar):
                result = pa.array([pa_result.as_py()], type=pa_result.type)
            else:
                result = pa.array(
                    [pa_result],
                    type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]),
                )
            return result

        if pc.is_null(pa_result).as_py():
            return self.dtype.na_value
        elif isinstance(pa_result, pa.Scalar):
            return pa_result.as_py()
        else:
            return pa_result

    def _explode(self):
        """
        See Series.explode.__doc__.
        """
1786 # child class explode method supports only list types; return
1787 # default implementation for non list types.
1788 if not pa.types.is_list(self.dtype.pyarrow_dtype):
1789 return super()._explode()
1790 values = self
1791 counts = pa.compute.list_value_length(values._pa_array)
1792 counts = counts.fill_null(1).to_numpy()
1793 fill_value = pa.scalar([None], type=self._pa_array.type)
1794 mask = counts == 0
1795 if mask.any():
1796 values = values.copy()
1797 values[mask] = fill_value
1798 counts = counts.copy()
1799 counts[mask] = 1
1800 values = values.fillna(fill_value)
1801 values = type(self)(pa.compute.list_flatten(values._pa_array))
1802 return values, counts
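
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    # null rows and empty lists each explode to a single missing value:
    #
    #   arr = pd.array([[1, 2], None, []], dtype=ArrowDtype(pa.list_(pa.int64())))
    #   values, counts = arr._explode()
    #   # values -> [1, 2, <NA>, <NA>]; counts -> np.array([2, 1, 1])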
1803
1804 def __setitem__(self, key, value) -> None:
1805 """Set one or more values inplace.
1806
1807 Parameters
1808 ----------
1809 key : int, ndarray, or slice
            When called from, e.g., ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.
1820
1821 Returns
1822 -------
1823 None
1824 """
1825 # GH50085: unwrap 1D indexers
1826 if isinstance(key, tuple) and len(key) == 1:
1827 key = key[0]
1828
1829 key = check_array_indexer(self, key)
1830 value = self._maybe_convert_setitem_value(value)
1831
1832 if com.is_null_slice(key):
1833 # fast path (GH50248)
1834 data = self._if_else(True, value, self._pa_array)
1835
1836 elif is_integer(key):
1837 # fast path
1838 key = cast(int, key)
1839 n = len(self)
1840 if key < 0:
1841 key += n
1842 if not 0 <= key < n:
1843 raise IndexError(
1844 f"index {key} is out of bounds for axis 0 with size {n}"
1845 )
1846 if isinstance(value, pa.Scalar):
1847 value = value.as_py()
1848 elif is_list_like(value):
1849 raise ValueError("Length of indexer and values mismatch")
1850 chunks = [
1851 *self._pa_array[:key].chunks,
1852 pa.array([value], type=self._pa_array.type, from_pandas=True),
1853 *self._pa_array[key + 1 :].chunks,
1854 ]
1855 data = pa.chunked_array(chunks).combine_chunks()
1856
1857 elif is_bool_dtype(key):
1858 key = np.asarray(key, dtype=np.bool_)
1859 data = self._replace_with_mask(self._pa_array, key, value)
1860
1861 elif is_scalar(value) or isinstance(value, pa.Scalar):
1862 mask = np.zeros(len(self), dtype=np.bool_)
1863 mask[key] = True
1864 data = self._if_else(mask, value, self._pa_array)
1865
1866 else:
1867 indices = np.arange(len(self))[key]
1868 if len(indices) != len(value):
1869 raise ValueError("Length of indexer and values mismatch")
1870 if len(indices) == 0:
1871 return
1872 argsort = np.argsort(indices)
1873 indices = indices[argsort]
1874 value = value.take(argsort)
1875 mask = np.zeros(len(self), dtype=np.bool_)
1876 mask[indices] = True
1877 data = self._replace_with_mask(self._pa_array, mask, value)
1878
1879 if isinstance(data, pa.Array):
1880 data = pa.chunked_array([data])
1881 self._pa_array = data
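
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([1, 2, 3], dtype="int64[pyarrow]")
    #   arr[0] = 10                                # integer fast path
    #   arr[np.array([False, True, True])] = 0     # boolean-mask path
    #   arr[[1, 2]] = [20, 30]                     # integer-indexer path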
1882
1883 def _rank_calc(
1884 self,
1885 *,
1886 axis: AxisInt = 0,
1887 method: str = "average",
1888 na_option: str = "keep",
1889 ascending: bool = True,
1890 pct: bool = False,
1891 ):
1892 if axis != 0:
1893 ranked = super()._rank(
1894 axis=axis,
1895 method=method,
1896 na_option=na_option,
1897 ascending=ascending,
1898 pct=pct,
1899 )
1900 # keep dtypes consistent with the implementation below
1901 if method == "average" or pct:
1902 pa_type = pa.float64()
1903 else:
1904 pa_type = pa.uint64()
1905 result = pa.array(ranked, type=pa_type, from_pandas=True)
1906 return result
1907
1908 data = self._pa_array.combine_chunks()
1909 sort_keys = "ascending" if ascending else "descending"
1910 null_placement = "at_start" if na_option == "top" else "at_end"
1911 tiebreaker = "min" if method == "average" else method
1912
1913 result = pc.rank(
1914 data,
1915 sort_keys=sort_keys,
1916 null_placement=null_placement,
1917 tiebreaker=tiebreaker,
1918 )
1919
1920 if na_option == "keep":
1921 mask = pc.is_null(self._pa_array)
1922 null = pa.scalar(None, type=result.type)
1923 result = pc.if_else(mask, null, result)
1924
1925 if method == "average":
1926 result_max = pc.rank(
1927 data,
1928 sort_keys=sort_keys,
1929 null_placement=null_placement,
1930 tiebreaker="max",
1931 )
1932 result_max = result_max.cast(pa.float64())
1933 result_min = result.cast(pa.float64())
1934 result = pc.divide(pc.add(result_min, result_max), 2)
1935
1936 if pct:
1937 if not pa.types.is_floating(result.type):
1938 result = result.cast(pa.float64())
1939 if method == "dense":
1940 divisor = pc.max(result)
1941 else:
1942 divisor = pc.count(result)
1943 result = pc.divide(result, divisor)
1944
1945 return result
1946
1947 def _rank(
1948 self,
1949 *,
1950 axis: AxisInt = 0,
1951 method: str = "average",
1952 na_option: str = "keep",
1953 ascending: bool = True,
1954 pct: bool = False,
1955 ):
1956 """
1957 See Series.rank.__doc__.
1958 """
1959 return type(self)(
1960 self._rank_calc(
1961 axis=axis,
1962 method=method,
1963 na_option=na_option,
1964 ascending=ascending,
1965 pct=pct,
1966 )
1967 )
1968
1969 def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self:
1970 """
1971 Compute the quantiles of self for each quantile in `qs`.
1972
1973 Parameters
1974 ----------
        qs : np.ndarray[float64]
            The quantiles to compute.
        interpolation : str
            Interpolation method, passed through to ``pc.quantile``.
1977
1978 Returns
1979 -------
1980 same type as self
1981 """
1982 pa_dtype = self._pa_array.type
1983
1984 data = self._pa_array
1985 if pa.types.is_temporal(pa_dtype):
1986 # https://github.com/apache/arrow/issues/33769 in these cases
1987 # we can cast to ints and back
1988 nbits = pa_dtype.bit_width
1989 if nbits == 32:
1990 data = data.cast(pa.int32())
1991 else:
1992 data = data.cast(pa.int64())
1993
1994 result = pc.quantile(data, q=qs, interpolation=interpolation)
1995
1996 if pa.types.is_temporal(pa_dtype):
1997 if pa.types.is_floating(result.type):
1998 result = pc.floor(result)
1999 nbits = pa_dtype.bit_width
2000 if nbits == 32:
2001 result = result.cast(pa.int32())
2002 else:
2003 result = result.cast(pa.int64())
2004 result = result.cast(pa_dtype)
2005
2006 return type(self)(result)
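
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([1, 2, 3, 4], dtype="int64[pyarrow]")
    #   arr._quantile(np.array([0.25, 0.5]), "linear")   # -> [1.75, 2.5]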
2007
2008 def _mode(self, dropna: bool = True) -> Self:
2009 """
2010 Returns the mode(s) of the ExtensionArray.
2011
2012 Always returns `ExtensionArray` even if only one value.
2013
2014 Parameters
2015 ----------
2016 dropna : bool, default True
2017 Don't consider counts of NA values.
2018
2019 Returns
2020 -------
2021 same type as self
2022 Sorted, if possible.
2023 """
2024 pa_type = self._pa_array.type
2025 if pa.types.is_temporal(pa_type):
2026 nbits = pa_type.bit_width
2027 if nbits == 32:
2028 data = self._pa_array.cast(pa.int32())
2029 elif nbits == 64:
2030 data = self._pa_array.cast(pa.int64())
2031 else:
2032 raise NotImplementedError(pa_type)
2033 else:
2034 data = self._pa_array
2035
2036 if dropna:
2037 data = data.drop_null()
2038
2039 res = pc.value_counts(data)
2040 most_common = res.field("values").filter(
2041 pc.equal(res.field("counts"), pc.max(res.field("counts")))
2042 )
2043
2044 if pa.types.is_temporal(pa_type):
2045 most_common = most_common.cast(pa_type)
2046
2047 most_common = most_common.take(pc.array_sort_indices(most_common))
2048 return type(self)(most_common)
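
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([1, 2, 2, 3, 3, None], dtype="int64[pyarrow]")
    #   arr._mode()   # -> [2, 3] (both occur twice; result is sorted)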
2049
2050 def _maybe_convert_setitem_value(self, value):
2051 """Maybe convert value to be pyarrow compatible."""
2052 try:
2053 value = self._box_pa(value, self._pa_array.type)
2054 except pa.ArrowTypeError as err:
2055 msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
2056 raise TypeError(msg) from err
2057 return value
2058
2059 def interpolate(
2060 self,
2061 *,
2062 method: InterpolateOptions,
2063 axis: int,
2064 index,
2065 limit,
2066 limit_direction,
2067 limit_area,
2068 copy: bool,
2069 **kwargs,
2070 ) -> Self:
2071 """
2072 See NDFrame.interpolate.__doc__.
2073 """
2074 # NB: we return type(self) even if copy=False
2075 mask = self.isna()
2076 if self.dtype.kind == "f":
2077 data = self._pa_array.to_numpy()
2078 elif self.dtype.kind in "iu":
2079 data = self.to_numpy(dtype="f8", na_value=0.0)
2080 else:
2081 raise NotImplementedError(
2082 f"interpolate is not implemented for dtype={self.dtype}"
2083 )
2084
2085 missing.interpolate_2d_inplace(
2086 data,
2087 method=method,
2088 axis=0,
2089 index=index,
2090 limit=limit,
2091 limit_direction=limit_direction,
2092 limit_area=limit_area,
2093 mask=mask,
2094 **kwargs,
2095 )
2096 return type(self)(self._box_pa_array(pa.array(data, mask=mask)))
2097
2098 @classmethod
2099 def _if_else(
2100 cls,
2101 cond: npt.NDArray[np.bool_] | bool,
2102 left: ArrayLike | Scalar,
2103 right: ArrayLike | Scalar,
2104 ):
2105 """
2106 Choose values based on a condition.
2107
        Analogous to pyarrow.compute.if_else, with logic
        to fall back to numpy for unsupported types.

        Parameters
        ----------
        cond : npt.NDArray[np.bool_] or bool
        left : ArrayLike or Scalar
        right : ArrayLike or Scalar
2116
2117 Returns
2118 -------
2119 pa.Array
2120 """
2121 try:
2122 return pc.if_else(cond, left, right)
2123 except pa.ArrowNotImplementedError:
2124 pass
2125
2126 def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
2127 if isinstance(value, (pa.Array, pa.ChunkedArray)):
2128 pa_type = value.type
2129 elif isinstance(value, pa.Scalar):
2130 pa_type = value.type
2131 value = value.as_py()
2132 else:
2133 pa_type = None
2134 return np.array(value, dtype=object), pa_type
2135
2136 left, left_type = _to_numpy_and_type(left)
2137 right, right_type = _to_numpy_and_type(right)
2138 pa_type = left_type or right_type
2139 result = np.where(cond, left, right)
2140 return pa.array(result, type=pa_type, from_pandas=True)
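
    # For example (illustrative sketch): take ``left`` where ``cond`` is True
    # and ``right`` elsewhere, falling back to numpy only when pyarrow's
    # ``if_else`` kernel does not support the type:
    #
    #   cond = np.array([True, False, True])
    #   ArrowExtensionArray._if_else(cond, pa.scalar(0), pa.array([1, 2, 3]))
    #   # -> <pyarrow array> [0, 2, 0]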
2141
2142 @classmethod
2143 def _replace_with_mask(
2144 cls,
2145 values: pa.Array | pa.ChunkedArray,
2146 mask: npt.NDArray[np.bool_] | bool,
2147 replacements: ArrayLike | Scalar,
2148 ):
2149 """
2150 Replace items selected with a mask.
2151
2152 Analogous to pyarrow.compute.replace_with_mask, with logic
        to fall back to numpy for unsupported types.
2154
2155 Parameters
2156 ----------
2157 values : pa.Array or pa.ChunkedArray
2158 mask : npt.NDArray[np.bool_] or bool
2159 replacements : ArrayLike or Scalar
2160 Replacement value(s)
2161
2162 Returns
2163 -------
2164 pa.Array or pa.ChunkedArray
2165 """
2166 if isinstance(replacements, pa.ChunkedArray):
2167 # replacements must be array or scalar, not ChunkedArray
2168 replacements = replacements.combine_chunks()
2169 if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
2170 # GH#52059 replace_with_mask segfaults for chunked array
2171 # https://github.com/apache/arrow/issues/34634
2172 values = values.combine_chunks()
2173 try:
2174 return pc.replace_with_mask(values, mask, replacements)
2175 except pa.ArrowNotImplementedError:
2176 pass
2177 if isinstance(replacements, pa.Array):
2178 replacements = np.array(replacements, dtype=object)
2179 elif isinstance(replacements, pa.Scalar):
2180 replacements = replacements.as_py()
2181 result = np.array(values, dtype=object)
2182 result[mask] = replacements
2183 return pa.array(result, type=values.type, from_pandas=True)
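
    # For example (illustrative sketch): one replacement value per True mask
    # entry:
    #
    #   values = pa.array([1, 2, 3])
    #   mask = np.array([False, True, False])
    #   ArrowExtensionArray._replace_with_mask(values, mask, pa.array([99]))
    #   # -> <pyarrow array> [1, 99, 3]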
2184
2185 # ------------------------------------------------------------------
2186 # GroupBy Methods
2187
2188 def _to_masked(self):
2189 pa_dtype = self._pa_array.type
2190
2191 if pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype):
2192 na_value = 1
2193 elif pa.types.is_boolean(pa_dtype):
2194 na_value = True
2195 else:
2196 raise NotImplementedError
2197
2198 dtype = _arrow_dtype_mapping()[pa_dtype]
2199 mask = self.isna()
2200 arr = self.to_numpy(dtype=dtype.numpy_dtype, na_value=na_value)
2201 return dtype.construct_array_type()(arr, mask)
2202
2203 def _groupby_op(
2204 self,
2205 *,
2206 how: str,
2207 has_dropped_na: bool,
2208 min_count: int,
2209 ngroups: int,
2210 ids: npt.NDArray[np.intp],
2211 **kwargs,
2212 ):
2213 if isinstance(self.dtype, StringDtype):
2214 return super()._groupby_op(
2215 how=how,
2216 has_dropped_na=has_dropped_na,
2217 min_count=min_count,
2218 ngroups=ngroups,
2219 ids=ids,
2220 **kwargs,
2221 )
2222
2223 # maybe convert to a compatible dtype optimized for groupby
2224 values: ExtensionArray
2225 pa_type = self._pa_array.type
2226 if pa.types.is_timestamp(pa_type):
2227 values = self._to_datetimearray()
2228 elif pa.types.is_duration(pa_type):
2229 values = self._to_timedeltaarray()
2230 else:
2231 values = self._to_masked()
2232
2233 result = values._groupby_op(
2234 how=how,
2235 has_dropped_na=has_dropped_na,
2236 min_count=min_count,
2237 ngroups=ngroups,
2238 ids=ids,
2239 **kwargs,
2240 )
2241 if isinstance(result, np.ndarray):
2242 return result
2243 return type(self)._from_sequence(result, copy=False)
2244
2245 def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
2246 """Apply a callable to each element while maintaining the chunking structure."""
2247 return [
2248 [
2249 None if val is None else func(val)
2250 for val in chunk.to_numpy(zero_copy_only=False)
2251 ]
2252 for chunk in self._pa_array.iterchunks()
2253 ]
2254
2255 def _str_count(self, pat: str, flags: int = 0):
2256 if flags:
2257 raise NotImplementedError(f"count not implemented with {flags=}")
2258 return type(self)(pc.count_substring_regex(self._pa_array, pat))
2259
2260 def _str_contains(
2261 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
2262 ):
2263 if flags:
2264 raise NotImplementedError(f"contains not implemented with {flags=}")
2265
2266 if regex:
2267 pa_contains = pc.match_substring_regex
2268 else:
2269 pa_contains = pc.match_substring
2270 result = pa_contains(self._pa_array, pat, ignore_case=not case)
2271 if not isna(na):
2272 result = result.fill_null(na)
2273 return type(self)(result)
2274
2275 def _str_startswith(self, pat: str | tuple[str, ...], na=None):
2276 if isinstance(pat, str):
2277 result = pc.starts_with(self._pa_array, pattern=pat)
2278 else:
2279 if len(pat) == 0:
2280 # For empty tuple, pd.StringDtype() returns null for missing values
2281 # and false for valid values.
2282 result = pc.if_else(pc.is_null(self._pa_array), None, False)
2283 else:
2284 result = pc.starts_with(self._pa_array, pattern=pat[0])
2285
2286 for p in pat[1:]:
2287 result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
2288 if not isna(na):
2289 result = result.fill_null(na)
2290 return type(self)(result)
2291
2292 def _str_endswith(self, pat: str | tuple[str, ...], na=None):
2293 if isinstance(pat, str):
2294 result = pc.ends_with(self._pa_array, pattern=pat)
2295 else:
2296 if len(pat) == 0:
2297 # For empty tuple, pd.StringDtype() returns null for missing values
2298 # and false for valid values.
2299 result = pc.if_else(pc.is_null(self._pa_array), None, False)
2300 else:
2301 result = pc.ends_with(self._pa_array, pattern=pat[0])
2302
2303 for p in pat[1:]:
2304 result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
2305 if not isna(na):
2306 result = result.fill_null(na)
2307 return type(self)(result)
2308
2309 def _str_replace(
2310 self,
2311 pat: str | re.Pattern,
2312 repl: str | Callable,
2313 n: int = -1,
2314 case: bool = True,
2315 flags: int = 0,
2316 regex: bool = True,
2317 ):
2318 if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
2319 raise NotImplementedError(
2320 "replace is not supported with a re.Pattern, callable repl, "
2321 "case=False, or flags!=0"
2322 )
2323
2324 func = pc.replace_substring_regex if regex else pc.replace_substring
2325 # https://github.com/apache/arrow/issues/39149
2326 # GH 56404, unexpected behavior with negative max_replacements with pyarrow.
2327 pa_max_replacements = None if n < 0 else n
2328 result = func(
2329 self._pa_array,
2330 pattern=pat,
2331 replacement=repl,
2332 max_replacements=pa_max_replacements,
2333 )
2334 return type(self)(result)
2335
    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            raise NotImplementedError(
                f"repeat is not implemented when repeats is {type(repeats).__name__}"
            )
        return type(self)(pc.binary_repeat(self._pa_array, repeats))
2343
2344 def _str_match(
2345 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
2346 ):
2347 if not pat.startswith("^"):
2348 pat = f"^{pat}"
2349 return self._str_contains(pat, case, flags, na, regex=True)
2350
2351 def _str_fullmatch(
2352 self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
2353 ):
2354 if not pat.endswith("$") or pat.endswith("\\$"):
2355 pat = f"{pat}$"
2356 return self._str_match(pat, case, flags, na)
2357
2358 def _str_find(self, sub: str, start: int = 0, end: int | None = None):
2359 if start != 0 and end is not None:
2360 slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
2361 result = pc.find_substring(slices, sub)
2362 not_found = pc.equal(result, -1)
2363 start_offset = max(0, start)
2364 offset_result = pc.add(result, start_offset)
2365 result = pc.if_else(not_found, result, offset_result)
2366 elif start == 0 and end is None:
2367 slices = self._pa_array
2368 result = pc.find_substring(slices, sub)
2369 else:
2370 raise NotImplementedError(
2371 f"find not implemented with {sub=}, {start=}, {end=}"
2372 )
2373 return type(self)(result)
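
    # For example (illustrative sketch): with ``start``/``end`` the search runs
    # on a slice, so found positions are shifted back to full-string offsets:
    #
    #   arr = ArrowExtensionArray(pa.chunked_array([["abcab"]]))
    #   arr._str_find("ab", start=1, end=5)   # -> [3] (-1 when not found)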
2374
2375 def _str_join(self, sep: str):
2376 if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
2377 self._pa_array.type
2378 ):
2379 result = self._apply_elementwise(list)
2380 result = pa.chunked_array(result, type=pa.list_(pa.string()))
2381 else:
2382 result = self._pa_array
2383 return type(self)(pc.binary_join(result, sep))
2384
2385 def _str_partition(self, sep: str, expand: bool):
2386 predicate = lambda val: val.partition(sep)
2387 result = self._apply_elementwise(predicate)
2388 return type(self)(pa.chunked_array(result))
2389
2390 def _str_rpartition(self, sep: str, expand: bool):
2391 predicate = lambda val: val.rpartition(sep)
2392 result = self._apply_elementwise(predicate)
2393 return type(self)(pa.chunked_array(result))
2394
2395 def _str_slice(
2396 self, start: int | None = None, stop: int | None = None, step: int | None = None
2397 ):
2398 if start is None:
2399 start = 0
2400 if step is None:
2401 step = 1
2402 return type(self)(
2403 pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
2404 )
2405
2406 def _str_isalnum(self):
2407 return type(self)(pc.utf8_is_alnum(self._pa_array))
2408
2409 def _str_isalpha(self):
2410 return type(self)(pc.utf8_is_alpha(self._pa_array))
2411
2412 def _str_isdecimal(self):
2413 return type(self)(pc.utf8_is_decimal(self._pa_array))
2414
2415 def _str_isdigit(self):
2416 return type(self)(pc.utf8_is_digit(self._pa_array))
2417
2418 def _str_islower(self):
2419 return type(self)(pc.utf8_is_lower(self._pa_array))
2420
2421 def _str_isnumeric(self):
2422 return type(self)(pc.utf8_is_numeric(self._pa_array))
2423
2424 def _str_isspace(self):
2425 return type(self)(pc.utf8_is_space(self._pa_array))
2426
2427 def _str_istitle(self):
2428 return type(self)(pc.utf8_is_title(self._pa_array))
2429
2430 def _str_isupper(self):
2431 return type(self)(pc.utf8_is_upper(self._pa_array))
2432
2433 def _str_len(self):
2434 return type(self)(pc.utf8_length(self._pa_array))
2435
2436 def _str_lower(self):
2437 return type(self)(pc.utf8_lower(self._pa_array))
2438
2439 def _str_upper(self):
2440 return type(self)(pc.utf8_upper(self._pa_array))
2441
2442 def _str_strip(self, to_strip=None):
2443 if to_strip is None:
2444 result = pc.utf8_trim_whitespace(self._pa_array)
2445 else:
2446 result = pc.utf8_trim(self._pa_array, characters=to_strip)
2447 return type(self)(result)
2448
2449 def _str_lstrip(self, to_strip=None):
2450 if to_strip is None:
2451 result = pc.utf8_ltrim_whitespace(self._pa_array)
2452 else:
2453 result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
2454 return type(self)(result)
2455
2456 def _str_rstrip(self, to_strip=None):
2457 if to_strip is None:
2458 result = pc.utf8_rtrim_whitespace(self._pa_array)
2459 else:
2460 result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
2461 return type(self)(result)
2462
2463 def _str_removeprefix(self, prefix: str):
2464 if not pa_version_under13p0:
2465 starts_with = pc.starts_with(self._pa_array, pattern=prefix)
2466 removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
2467 result = pc.if_else(starts_with, removed, self._pa_array)
2468 return type(self)(result)
2469 predicate = lambda val: val.removeprefix(prefix)
2470 result = self._apply_elementwise(predicate)
2471 return type(self)(pa.chunked_array(result))
2472
2473 def _str_casefold(self):
2474 predicate = lambda val: val.casefold()
2475 result = self._apply_elementwise(predicate)
2476 return type(self)(pa.chunked_array(result))
2477
2478 def _str_encode(self, encoding: str, errors: str = "strict"):
2479 predicate = lambda val: val.encode(encoding, errors)
2480 result = self._apply_elementwise(predicate)
2481 return type(self)(pa.chunked_array(result))
2482
2483 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
2484 if flags:
2485 raise NotImplementedError("Only flags=0 is implemented.")
2486 groups = re.compile(pat).groupindex.keys()
2487 if len(groups) == 0:
2488 raise ValueError(f"{pat=} must contain a symbolic group name.")
2489 result = pc.extract_regex(self._pa_array, pat)
2490 if expand:
2491 return {
2492 col: type(self)(pc.struct_field(result, [i]))
2493 for col, i in zip(groups, range(result.type.num_fields))
2494 }
2495 else:
2496 return type(self)(pc.struct_field(result, [0]))
2497
2498 def _str_findall(self, pat: str, flags: int = 0):
2499 regex = re.compile(pat, flags=flags)
2500 predicate = lambda val: regex.findall(val)
2501 result = self._apply_elementwise(predicate)
2502 return type(self)(pa.chunked_array(result))
2503
2504 def _str_get_dummies(self, sep: str = "|"):
2505 split = pc.split_pattern(self._pa_array, sep)
2506 flattened_values = pc.list_flatten(split)
2507 uniques = flattened_values.unique()
2508 uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
2509 lengths = pc.list_value_length(split).fill_null(0).to_numpy()
2510 n_rows = len(self)
2511 n_cols = len(uniques)
2512 indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
2513 indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
2514 dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
2515 dummies[indices] = True
2516 dummies = dummies.reshape((n_rows, n_cols))
2517 result = type(self)(pa.array(list(dummies)))
2518 return result, uniques_sorted.to_pylist()
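
    # For example (illustrative sketch): null rows produce all-False dummies:
    #
    #   arr = ArrowExtensionArray(pa.chunked_array([["a|b", "b", None]]))
    #   dummies, labels = arr._str_get_dummies()
    #   # labels -> ['a', 'b']
    #   # dummies -> [[True, True], [False, True], [False, False]]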
2519
2520 def _str_index(self, sub: str, start: int = 0, end: int | None = None):
2521 predicate = lambda val: val.index(sub, start, end)
2522 result = self._apply_elementwise(predicate)
2523 return type(self)(pa.chunked_array(result))
2524
2525 def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
2526 predicate = lambda val: val.rindex(sub, start, end)
2527 result = self._apply_elementwise(predicate)
2528 return type(self)(pa.chunked_array(result))
2529
2530 def _str_normalize(self, form: str):
2531 predicate = lambda val: unicodedata.normalize(form, val)
2532 result = self._apply_elementwise(predicate)
2533 return type(self)(pa.chunked_array(result))
2534
2535 def _str_rfind(self, sub: str, start: int = 0, end=None):
2536 predicate = lambda val: val.rfind(sub, start, end)
2537 result = self._apply_elementwise(predicate)
2538 return type(self)(pa.chunked_array(result))
2539
2540 def _str_split(
2541 self,
2542 pat: str | None = None,
2543 n: int | None = -1,
2544 expand: bool = False,
2545 regex: bool | None = None,
2546 ):
2547 if n in {-1, 0}:
2548 n = None
2549 if pat is None:
2550 split_func = pc.utf8_split_whitespace
2551 elif regex:
2552 split_func = functools.partial(pc.split_pattern_regex, pattern=pat)
2553 else:
2554 split_func = functools.partial(pc.split_pattern, pattern=pat)
2555 return type(self)(split_func(self._pa_array, max_splits=n))
2556
2557 def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
2558 if n in {-1, 0}:
2559 n = None
2560 if pat is None:
2561 return type(self)(
2562 pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True)
2563 )
2564 else:
2565 return type(self)(
2566 pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True)
2567 )
2568
2569 def _str_translate(self, table: dict[int, str]):
2570 predicate = lambda val: val.translate(table)
2571 result = self._apply_elementwise(predicate)
2572 return type(self)(pa.chunked_array(result))
2573
2574 def _str_wrap(self, width: int, **kwargs):
2575 kwargs["width"] = width
2576 tw = textwrap.TextWrapper(**kwargs)
2577 predicate = lambda val: "\n".join(tw.wrap(val))
2578 result = self._apply_elementwise(predicate)
2579 return type(self)(pa.chunked_array(result))
2580
2581 @property
2582 def _dt_days(self):
2583 return type(self)(
2584 pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32())
2585 )
2586
2587 @property
2588 def _dt_hours(self):
2589 return type(self)(
2590 pa.array(
2591 [
2592 td.components.hours if td is not NaT else None
2593 for td in self._to_timedeltaarray()
2594 ],
2595 type=pa.int32(),
2596 )
2597 )
2598
2599 @property
2600 def _dt_minutes(self):
2601 return type(self)(
2602 pa.array(
2603 [
2604 td.components.minutes if td is not NaT else None
2605 for td in self._to_timedeltaarray()
2606 ],
2607 type=pa.int32(),
2608 )
2609 )
2610
2611 @property
2612 def _dt_seconds(self):
2613 return type(self)(
2614 pa.array(
2615 self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32()
2616 )
2617 )
2618
2619 @property
2620 def _dt_milliseconds(self):
2621 return type(self)(
2622 pa.array(
2623 [
2624 td.components.milliseconds if td is not NaT else None
2625 for td in self._to_timedeltaarray()
2626 ],
2627 type=pa.int32(),
2628 )
2629 )
2630
2631 @property
2632 def _dt_microseconds(self):
2633 return type(self)(
2634 pa.array(
2635 self._to_timedeltaarray().microseconds,
2636 from_pandas=True,
2637 type=pa.int32(),
2638 )
2639 )
2640
2641 @property
2642 def _dt_nanoseconds(self):
2643 return type(self)(
2644 pa.array(
2645 self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32()
2646 )
2647 )
2648
2649 def _dt_to_pytimedelta(self):
2650 data = self._pa_array.to_pylist()
2651 if self._dtype.pyarrow_dtype.unit == "ns":
2652 data = [None if ts is None else ts.to_pytimedelta() for ts in data]
2653 return np.array(data, dtype=object)
2654
2655 def _dt_total_seconds(self):
2656 return type(self)(
2657 pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True)
2658 )
2659
2660 def _dt_as_unit(self, unit: str):
2661 if pa.types.is_date(self.dtype.pyarrow_dtype):
2662 raise NotImplementedError("as_unit not implemented for date types")
2663 pd_array = self._maybe_convert_datelike_array()
        # Don't cast _pa_array directly; convert via the pandas array so that
        # pandas' unit conversion rules are followed
2665 return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True))
2666
2667 @property
2668 def _dt_year(self):
2669 return type(self)(pc.year(self._pa_array))
2670
2671 @property
2672 def _dt_day(self):
2673 return type(self)(pc.day(self._pa_array))
2674
2675 @property
2676 def _dt_day_of_week(self):
2677 return type(self)(pc.day_of_week(self._pa_array))
2678
2679 _dt_dayofweek = _dt_day_of_week
2680 _dt_weekday = _dt_day_of_week
2681
2682 @property
2683 def _dt_day_of_year(self):
2684 return type(self)(pc.day_of_year(self._pa_array))
2685
2686 _dt_dayofyear = _dt_day_of_year
2687
2688 @property
2689 def _dt_hour(self):
2690 return type(self)(pc.hour(self._pa_array))
2691
2692 def _dt_isocalendar(self):
2693 return type(self)(pc.iso_calendar(self._pa_array))
2694
2695 @property
2696 def _dt_is_leap_year(self):
2697 return type(self)(pc.is_leap_year(self._pa_array))
2698
2699 @property
2700 def _dt_is_month_start(self):
2701 return type(self)(pc.equal(pc.day(self._pa_array), 1))
2702
2703 @property
2704 def _dt_is_month_end(self):
2705 result = pc.equal(
2706 pc.days_between(
2707 pc.floor_temporal(self._pa_array, unit="day"),
2708 pc.ceil_temporal(self._pa_array, unit="month"),
2709 ),
2710 1,
2711 )
2712 return type(self)(result)
2713
2714 @property
2715 def _dt_is_year_start(self):
2716 return type(self)(
2717 pc.and_(
2718 pc.equal(pc.month(self._pa_array), 1),
2719 pc.equal(pc.day(self._pa_array), 1),
2720 )
2721 )
2722
2723 @property
2724 def _dt_is_year_end(self):
2725 return type(self)(
2726 pc.and_(
2727 pc.equal(pc.month(self._pa_array), 12),
2728 pc.equal(pc.day(self._pa_array), 31),
2729 )
2730 )
2731
2732 @property
2733 def _dt_is_quarter_start(self):
2734 result = pc.equal(
2735 pc.floor_temporal(self._pa_array, unit="quarter"),
2736 pc.floor_temporal(self._pa_array, unit="day"),
2737 )
2738 return type(self)(result)
2739
2740 @property
2741 def _dt_is_quarter_end(self):
2742 result = pc.equal(
2743 pc.days_between(
2744 pc.floor_temporal(self._pa_array, unit="day"),
2745 pc.ceil_temporal(self._pa_array, unit="quarter"),
2746 ),
2747 1,
2748 )
2749 return type(self)(result)
2750
2751 @property
2752 def _dt_days_in_month(self):
2753 result = pc.days_between(
2754 pc.floor_temporal(self._pa_array, unit="month"),
2755 pc.ceil_temporal(self._pa_array, unit="month"),
2756 )
2757 return type(self)(result)
2758
2759 _dt_daysinmonth = _dt_days_in_month
2760
2761 @property
2762 def _dt_microsecond(self):
2763 return type(self)(pc.microsecond(self._pa_array))
2764
2765 @property
2766 def _dt_minute(self):
2767 return type(self)(pc.minute(self._pa_array))
2768
2769 @property
2770 def _dt_month(self):
2771 return type(self)(pc.month(self._pa_array))
2772
2773 @property
2774 def _dt_nanosecond(self):
2775 return type(self)(pc.nanosecond(self._pa_array))
2776
2777 @property
2778 def _dt_quarter(self):
2779 return type(self)(pc.quarter(self._pa_array))
2780
2781 @property
2782 def _dt_second(self):
2783 return type(self)(pc.second(self._pa_array))
2784
2785 @property
2786 def _dt_date(self):
2787 return type(self)(self._pa_array.cast(pa.date32()))
2788
2789 @property
2790 def _dt_time(self):
2791 unit = (
2792 self.dtype.pyarrow_dtype.unit
2793 if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
2794 else "ns"
2795 )
2796 return type(self)(self._pa_array.cast(pa.time64(unit)))
2797
2798 @property
2799 def _dt_tz(self):
2800 return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz)
2801
2802 @property
2803 def _dt_unit(self):
2804 return self.dtype.pyarrow_dtype.unit
2805
2806 def _dt_normalize(self):
2807 return type(self)(pc.floor_temporal(self._pa_array, 1, "day"))
2808
2809 def _dt_strftime(self, format: str):
2810 return type(self)(pc.strftime(self._pa_array, format=format))
2811
2812 def _round_temporally(
2813 self,
2814 method: Literal["ceil", "floor", "round"],
2815 freq,
2816 ambiguous: TimeAmbiguous = "raise",
2817 nonexistent: TimeNonexistent = "raise",
2818 ):
2819 if ambiguous != "raise":
2820 raise NotImplementedError("ambiguous is not supported.")
2821 if nonexistent != "raise":
2822 raise NotImplementedError("nonexistent is not supported.")
2823 offset = to_offset(freq)
2824 if offset is None:
2825 raise ValueError(f"Must specify a valid frequency: {freq}")
2826 pa_supported_unit = {
2827 "Y": "year",
2828 "YS": "year",
2829 "Q": "quarter",
2830 "QS": "quarter",
2831 "M": "month",
2832 "MS": "month",
2833 "W": "week",
2834 "D": "day",
2835 "h": "hour",
2836 "min": "minute",
2837 "s": "second",
2838 "ms": "millisecond",
2839 "us": "microsecond",
2840 "ns": "nanosecond",
2841 }
2842 unit = pa_supported_unit.get(offset._prefix, None)
2843 if unit is None:
2844 raise ValueError(f"{freq=} is not supported")
2845 multiple = offset.n
2846 rounding_method = getattr(pc, f"{method}_temporal")
2847 return type(self)(rounding_method(self._pa_array, multiple=multiple, unit=unit))
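
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([pd.Timestamp("2023-01-15 13:45")],
    #                  dtype="timestamp[ns][pyarrow]")
    #   arr._dt_floor("h")    # -> ['2023-01-15 13:00:00']
    #   arr._dt_round("D")    # -> ['2023-01-16 00:00:00']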
2848
2849 def _dt_ceil(
2850 self,
2851 freq,
2852 ambiguous: TimeAmbiguous = "raise",
2853 nonexistent: TimeNonexistent = "raise",
2854 ):
2855 return self._round_temporally("ceil", freq, ambiguous, nonexistent)
2856
2857 def _dt_floor(
2858 self,
2859 freq,
2860 ambiguous: TimeAmbiguous = "raise",
2861 nonexistent: TimeNonexistent = "raise",
2862 ):
2863 return self._round_temporally("floor", freq, ambiguous, nonexistent)
2864
2865 def _dt_round(
2866 self,
2867 freq,
2868 ambiguous: TimeAmbiguous = "raise",
2869 nonexistent: TimeNonexistent = "raise",
2870 ):
2871 return self._round_temporally("round", freq, ambiguous, nonexistent)
2872
2873 def _dt_day_name(self, locale: str | None = None):
2874 if locale is None:
2875 locale = "C"
2876 return type(self)(pc.strftime(self._pa_array, format="%A", locale=locale))
2877
2878 def _dt_month_name(self, locale: str | None = None):
2879 if locale is None:
2880 locale = "C"
2881 return type(self)(pc.strftime(self._pa_array, format="%B", locale=locale))
2882
2883 def _dt_to_pydatetime(self):
2884 if pa.types.is_date(self.dtype.pyarrow_dtype):
2885 raise ValueError(
2886 f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
2887 "Convert to pyarrow timestamp type."
2888 )
2889 data = self._pa_array.to_pylist()
2890 if self._dtype.pyarrow_dtype.unit == "ns":
2891 data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
2892 return np.array(data, dtype=object)
2893
2894 def _dt_tz_localize(
2895 self,
2896 tz,
2897 ambiguous: TimeAmbiguous = "raise",
2898 nonexistent: TimeNonexistent = "raise",
2899 ):
2900 if ambiguous != "raise":
2901 raise NotImplementedError(f"{ambiguous=} is not supported")
2902 nonexistent_pa = {
2903 "raise": "raise",
2904 "shift_backward": "earliest",
2905 "shift_forward": "latest",
2906 }.get(
2907 nonexistent, None # type: ignore[arg-type]
2908 )
2909 if nonexistent_pa is None:
2910 raise NotImplementedError(f"{nonexistent=} is not supported")
2911 if tz is None:
2912 result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
2913 else:
2914 result = pc.assume_timezone(
2915 self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
2916 )
2917 return type(self)(result)
2918
2919 def _dt_tz_convert(self, tz):
2920 if self.dtype.pyarrow_dtype.tz is None:
2921 raise TypeError(
2922 "Cannot convert tz-naive timestamps, use tz_localize to localize"
2923 )
2924 current_unit = self.dtype.pyarrow_dtype.unit
2925 result = self._pa_array.cast(pa.timestamp(current_unit, tz))
2926 return type(self)(result)
2927
2928
2929def transpose_homogeneous_pyarrow(
2930 arrays: Sequence[ArrowExtensionArray],
2931) -> list[ArrowExtensionArray]:
2932 """Transpose arrow extension arrays in a list, but faster.
2933
2934 Input should be a list of arrays of equal length and all have the same
2935 dtype. The caller is responsible for ensuring validity of input data.
2936 """
2937 arrays = list(arrays)
2938 nrows, ncols = len(arrays[0]), len(arrays)
2939 indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten()
2940 arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks])
2941 arr = arr.take(indices)
2942 return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]
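

# For example (illustrative sketch; assumes ``import pandas as pd``): three
# length-2 columns become two length-3 rows:
#
#   cols = [pd.array([1, 2], dtype="int64[pyarrow]") for _ in range(3)]
#   transpose_homogeneous_pyarrow(cols)
#   # -> [ArrowExtensionArray([1, 1, 1]), ArrowExtensionArray([2, 2, 2])]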