from __future__ import annotations

from copy import deepcopy
import functools
import operator
import re
import sys
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    FillnaOptions,
    Iterator,
    NpDtype,
    PositionalIndexer,
    Scalar,
    SortKind,
    TakeIndexer,
    TimeAmbiguous,
    TimeNonexistent,
    npt,
)
from pandas.compat import (
    pa_version_under7p0,
    pa_version_under8p0,
    pa_version_under9p0,
    pa_version_under11p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import roperator
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.tseries.frequencies import to_offset

if not pa_version_under7p0:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> pa.ChunkedArray:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            return arrow_array.cast(pa.float64())
        return arrow_array

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # Ensure int // int -> int mirroring Python/Numpy behavior
        # as pc.floor(pc.divide_checked(int, int)) -> float
        result = pc.floor(pc.divide(left, right))
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            result = result.cast(left.type)
        return result
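
    # Illustrative sketch (comments only, not executed at import; ``int64_arr``
    # and ``two`` are hypothetical names): with integer operands,
    # ``cast_for_truediv`` upcasts so division yields floats, while
    # ``floordiv_compat`` floors and casts back so the integer type survives:
    #   pc.divide(cast_for_truediv(int64_arr, two), two)  -> float64 values
    #   floordiv_compat(int64_arr, two)                   -> int64 values
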
121
122 ARROW_ARITHMETIC_FUNCS = {
123 "add": pc.add_checked,
124 "radd": lambda x, y: pc.add_checked(y, x),
125 "sub": pc.subtract_checked,
126 "rsub": lambda x, y: pc.subtract_checked(y, x),
127 "mul": pc.multiply_checked,
128 "rmul": lambda x, y: pc.multiply_checked(y, x),
129 "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
130 "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
131 "floordiv": lambda x, y: floordiv_compat(x, y),
132 "rfloordiv": lambda x, y: floordiv_compat(y, x),
133 "mod": NotImplemented,
134 "rmod": NotImplemented,
135 "divmod": NotImplemented,
136 "rdivmod": NotImplemented,
137 "pow": pc.power_checked,
138 "rpow": lambda x, y: pc.power_checked(y, x),
139 }
140
141if TYPE_CHECKING:
142 from pandas._typing import (
143 NumpySorter,
144 NumpyValueArrayLike,
145 )
146
147 from pandas import Series
148
149ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
150
151
152def get_unit_from_pa_dtype(pa_dtype):
153 # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
154 if pa_version_under11p0:
155 unit = str(pa_dtype).split("[", 1)[-1][:-1]
156 if unit not in ["s", "ms", "us", "ns"]:
157 raise ValueError(pa_dtype)
158 return unit
159 return pa_dtype.unit
160
161
162def to_pyarrow_type(
163 dtype: ArrowDtype | pa.DataType | Dtype | None,
164) -> pa.DataType | None:
165 """
166 Convert dtype to a pyarrow type instance.
167 """
168 if isinstance(dtype, ArrowDtype):
169 return dtype.pyarrow_dtype
170 elif isinstance(dtype, pa.DataType):
171 return dtype
172 elif isinstance(dtype, DatetimeTZDtype):
173 return pa.timestamp(dtype.unit, dtype.tz)
174 elif dtype:
175 try:
176 # Accepts python types too
177 # Doesn't handle all numpy types
178 return pa.from_numpy_dtype(dtype)
179 except pa.ArrowNotImplementedError:
180 pass
181 return None
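
# Illustrative sketch (comments only, not executed at import; assumes pyarrow
# is installed): each branch above maps a different dtype flavor to the same
# pyarrow type, e.g.
#   to_pyarrow_type(pa.int64())                   -> int64
#   to_pyarrow_type(np.dtype("int64"))            -> int64
#   to_pyarrow_type(DatetimeTZDtype("ns", "UTC")) -> timestamp[ns, tz=UTC]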


class ArrowExtensionArray(
    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__.
    Some methods may either raise an exception or emit a ``PerformanceWarning`` if an
    associated compute function is not available based on the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    _data: pa.ChunkedArray
    _dtype: ArrowDtype

    def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
        if pa_version_under7p0:
            msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
            raise ImportError(msg)
        if isinstance(values, pa.Array):
            self._data = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._data = values
        else:
            raise ValueError(
                f"Unsupported type '{type(values)}' for ArrowExtensionArray"
            )
        self._dtype = ArrowDtype(self._data.type)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Construct a new ExtensionArray from a sequence of scalars.
        """
        pa_dtype = to_pyarrow_type(dtype)
        if (
            isinstance(scalars, np.ndarray)
            and isinstance(dtype, ArrowDtype)
            and (
                pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
            )
        ):
            # See https://github.com/apache/arrow/issues/35289
            scalars = scalars.tolist()

        if isinstance(scalars, cls):
            scalars = scalars._data
        elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            if copy and is_array_like(scalars):
                # pa array should not get updated when numpy array is updated
                scalars = deepcopy(scalars)
            try:
                scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
            except pa.ArrowInvalid:
                # GH50430: let pyarrow infer type, then cast
                scalars = pa.array(scalars, from_pandas=True)
                if pa_dtype:
                    if pa.types.is_dictionary(pa_dtype):
                        scalars = scalars.dictionary_encode()
                    else:
                        scalars = scalars.cast(pa_dtype)
        arr = cls(scalars)
        if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
            # GH52843: upstream bug for duration types when originally
            # constructed with data containing numpy NaT.
            # https://github.com/apache/arrow/issues/35088
            arr = arr.fillna(arr.dtype.na_value)
        return arr
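
    # Illustrative sketch (comments only): this constructor path is what
    # ``pd.array(..., dtype="int64[pyarrow]")`` ultimately goes through, e.g.
    #   ArrowExtensionArray._from_sequence([1, None], dtype="int64[pyarrow]")
    # boxes the scalars with ``pa.array(..., from_pandas=True)`` so ``None``
    # becomes a pyarrow null rather than a float NaN.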

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.
        """
        pa_type = to_pyarrow_type(dtype)
        if (
            pa_type is None
            or pa.types.is_binary(pa_type)
            or pa.types.is_string(pa_type)
        ):
            # pa_type is None: Let pa.array infer
            # pa_type is string/binary: scalars already correct type
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
            if pa_type.unit != "ns":
                # GH51175: test_from_sequence_of_strings_pa_array
                # attempt to parse as int64 reflecting pyarrow's
                # duration to string casting behavior
                mask = isna(scalars)
                if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
                    strings = pa.array(strings, type=pa.string(), from_pandas=True)
                strings = pc.if_else(mask, None, strings)
                try:
                    scalars = strings.cast(pa.int64())
                except pa.ArrowInvalid:
                    pass
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
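
    # Illustrative sketch (comments only): each branch above reuses an existing
    # pandas parser before round-tripping through ``_from_sequence``, e.g.
    #   _from_sequence_of_strings(["1", "2"], dtype="int64[pyarrow]")
    # goes through ``to_numeric``, while a timestamp dtype goes through
    # ``to_datetime``.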

    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if item is Ellipsis:
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar
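
    # Illustrative sketch (comments only): the indexing flavors above in
    # doctest style, with outputs abbreviated:
    #   >>> arr = pd.array([1, 2, None], dtype="int64[pyarrow]")
    #   >>> arr[0]      # scalar path
    #   1
    #   >>> arr[1:]     # slice path, dispatched to pyarrow
    #   <ArrowExtensionArray> [2, <NA>] ...
    #   >>> arr[2]      # a null scalar comes back as the dtype's NA value
    #   <NA>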

    def __iter__(self) -> Iterator[Any]:
        """
        Iterate over elements of the array.
        """
        na_value = self._dtype.na_value
        for value in self._data:
            val = value.as_py()
            if val is None:
                yield na_value
            else:
                yield val

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow ChunkedArray."""
        return self._data

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.to_numpy(dtype=dtype)

    def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.invert(self._data))

    def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.negate_checked(self._data))

    def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(self._data)

    def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.abs_checked(self._data))

    # GH 42600: __getstate__/__setstate__ not necessary once
    # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
    def __getstate__(self):
        state = self.__dict__.copy()
        state["_data"] = self._data.combine_chunks()
        return state

    def __setstate__(self, state) -> None:
        state["_data"] = pa.chunked_array(state["_data"])
        self.__dict__.update(state)

    def _cmp_method(self, other, op):
        from pandas.core.arrays.masked import BaseMaskedArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                result = pa.array(result, type=pa.bool_())
                result = pc.if_else(valid, result, None)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return ArrowExtensionArray(result)
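
    # Illustrative sketch (comments only): comparisons dispatch through the
    # ``ARROW_CMP_FUNCS`` table, so e.g.
    #   pd.array([1, None], dtype="int64[pyarrow]") == 1
    # runs ``pc.equal`` and propagates the null, giving a boolean[pyarrow]
    # result of [True, <NA>].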

    def _evaluate_op_method(self, other, op, arrow_funcs):
        from pandas.core.arrays.masked import BaseMaskedArray

        pa_type = self._data.type
        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
            operator.add,
            roperator.radd,
        ]:
            length = self._data.length()

            seps: list[str] | list[bytes]
            if pa.types.is_string(pa_type):
                seps = [""] * length
            else:
                seps = [b""] * length

            if is_scalar(other):
                other = [other] * length
            elif isinstance(other, type(self)):
                other = other._data
            if op is operator.add:
                result = pc.binary_join_element_wise(self._data, other, seps)
            else:
                result = pc.binary_join_element_wise(other, self._data, seps)
            return type(self)(result)

        pc_func = arrow_funcs[op.__name__]
        if pc_func is NotImplemented:
            raise NotImplementedError(f"{op.__name__} not implemented.")
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, pa.array(other, from_pandas=True))
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS:
                # pyarrow kleene ops require null to be typed
                pa_scalar = pa.scalar(None, type=self._data.type)
            else:
                pa_scalar = pa.scalar(other)
            result = pc_func(self._data, pa_scalar)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return type(self)(result)

    def _logical_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)

    def equals(self, other) -> bool:
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data

    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    def __contains__(self, key) -> bool:
        # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key) and isna(key):
                return pc.any(pc.is_nan(self._data)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))
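
    # Illustrative note (comments only): ``__contains__`` special-cases NA-like
    # keys. A float NaN key is checked with ``pc.is_nan`` against float data,
    # while other NA-like keys (None, pd.NaT) return False unless they are
    # exactly ``self.dtype.na_value``; everything else defers to the base
    # ExtensionArray implementation.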

    @property
    def _hasna(self) -> bool:
        return self._data.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        return self._data.is_null().to_numpy()

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)

    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)
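
    # Illustrative sketch (comments only): ``argsort`` maps pandas options onto
    # ``pc.array_sort_indices``, e.g.
    #   >>> pd.array([2, None, 1], dtype="int64[pyarrow]").argsort()
    #   array([2, 0, 1])
    # with the null placed last by default (na_position="last").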

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._data
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)

    def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._data))

    @doc(ExtensionArray.fillna)
    def fillna(
        self: ArrowExtensionArrayT,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> ArrowExtensionArrayT:
        value, method = validate_fillna_kwargs(value, method)

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit)

        if method is not None:
            fallback_performancewarning()
            return super().fillna(value=value, method=method, limit=limit)

        if is_array_like(value):
            value = cast(ArrayLike, value)
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f"expected {len(self)}"
                )

        def convert_fill_value(value, pa_type, dtype):
            if value is None:
                return value
            if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
                return value
            if is_array_like(value):
                pa_box = pa.array
            else:
                pa_box = pa.scalar
            try:
                value = pa_box(value, type=pa_type, from_pandas=True)
            except pa.ArrowTypeError as err:
                msg = f"Invalid value '{str(value)}' for dtype {dtype}"
                raise TypeError(msg) from err
            return value

        fill_value = convert_fill_value(value, self._data.type, self.dtype)

        try:
            if method is None:
                return type(self)(pc.fill_null(self._data, fill_value=fill_value))
            elif method == "pad":
                return type(self)(pc.fill_null_forward(self._data))
            elif method == "backfill":
                return type(self)(pc.fill_null_backward(self._data))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            # matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            # a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit)

    def isin(self, values) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
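
    # Illustrative sketch (comments only):
    #   >>> pd.array([1, 2, None], dtype="int64[pyarrow]").isin([2, 3])
    #   array([False,  True, False])
    # Nulls come back False because the result is materialized with
    # ``np.array(result, dtype=np.bool_)``.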

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._data.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            pa_indices = encoded.combine_chunks().indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(encoded.chunk(0).dictionary)

        if pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques
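
    # Illustrative sketch (comments only): factorize is dictionary encoding
    # plus a null sentinel, e.g.
    #   pd.array([1, 2, 1, None], dtype="int64[pyarrow]").factorize()
    # conceptually yields codes [0, 1, 0, -1] and uniques [1, 2].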

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(
        self: ArrowExtensionArrayT, decimals: int = 0, *args, **kwargs
    ) -> ArrowExtensionArrayT:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
        """
        return type(self)(pc.round(self._data, ndigits=decimals))

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        if isinstance(value, ExtensionArray):
            value = value.astype(object)
        # Base class searchsorted would cast to object, which is *much* slower.
        return self.to_numpy().searchsorted(value, side=side, sorter=sorter)

    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))

    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        if dtype is None and self._hasna:
            dtype = object
        if na_value is lib.no_default:
            na_value = self.dtype.na_value

        pa_type = self._data.type
        if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
            # temporal types with units and/or timezones currently
            # require pandas/python scalars to pass all tests
            # TODO: improve performance (this is slow)
            result = np.array(list(self), dtype=dtype)
        elif is_object_dtype(dtype) and self._hasna:
            result = np.empty(len(self), dtype=object)
            mask = ~self.isna()
            result[mask] = np.asarray(self[mask]._data)
        elif pa.types.is_null(self._data.type):
            result = np.asarray(self._data, dtype=dtype)
            if not isna(na_value):
                result[:] = na_value
            return result
        elif self._hasna:
            data = self.copy()
            data[self.isna()] = na_value
            return np.asarray(data._data, dtype=dtype)
        else:
            result = np.asarray(self._data, dtype=dtype)
            if copy:
                result = result.copy()
        if self._hasna:
            result[self.isna()] = na_value
        return result
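
    # Illustrative sketch (comments only): with missing values present the
    # default result dtype falls back to object so that pd.NA survives, e.g.
    #   >>> pd.array([1, None], dtype="int64[pyarrow]").to_numpy()
    #   array([1, <NA>], dtype=object)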

    def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
        """
        pa_type = self._data.type

        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        pa_result = pc.unique(data)

        if pa.types.is_duration(pa_type):
            pa_result = pa_result.cast(pa_type)

        return type(self)(pa_result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        from pandas import (
            Index,
            Series,
        )

        vc = data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and data.null_count > 0:
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        if pa.types.is_duration(pa_type):
            values = values.cast(pa_type)

        counts = ArrowExtensionArray(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index, name="count", copy=False)
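
    # Illustrative sketch (comments only, output abbreviated):
    #   >>> pd.array([1, 1, None], dtype="int64[pyarrow]").value_counts()
    #   1    2
    #   Name: count, dtype: int64[pyarrow]
    # With dropna=False the null would appear as an <NA> index entry.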

    @classmethod
    def _concat_same_type(
        cls: type[ArrowExtensionArrayT], to_concat
    ) -> ArrowExtensionArrayT:
        """
        Concatenate multiple ArrowExtensionArrays.

        Parameters
        ----------
        to_concat : sequence of ArrowExtensionArrays

        Returns
        -------
        ArrowExtensionArray
        """
        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
        if to_concat[0].dtype == "string":
            # StringDtype has no attribute pyarrow_dtype
            pa_dtype = pa.string()
        else:
            pa_dtype = to_concat[0].dtype.pyarrow_dtype
        arr = pa.chunked_array(chunks, type=pa_dtype)
        return cls(arr)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> ArrowExtensionArray | ExtensionArray:
        """
        Return an ExtensionArray performing an accumulation operation.

        The underlying data type might change.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            - cummin
            - cummax
            - cumsum
            - cumprod
        skipna : bool, default True
            If True, skip NA values.
        **kwargs
            Additional keyword arguments passed to the accumulation function.
            Currently, there is no supported kwarg.

        Returns
        -------
        array

        Raises
        ------
        NotImplementedError : subclass does not define accumulations
        """
        pyarrow_name = {
            "cumsum": "cumulative_sum_checked",
        }.get(name, name)
        pyarrow_meth = getattr(pc, pyarrow_name, None)
        if pyarrow_meth is None:
            return super()._accumulate(name, skipna=skipna, **kwargs)

        data_to_accum = self._data

        pa_dtype = data_to_accum.type
        if pa.types.is_duration(pa_dtype):
            data_to_accum = data_to_accum.cast(pa.int64())

        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)

        if pa.types.is_duration(pa_dtype):
            result = result.cast(pa_dtype)

        return type(self)(result)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        pa_type = self._data.type

        data_to_reduce = self._data

        if name in ["any", "all"] and (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_duration(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            # pyarrow only supports any/all for boolean dtype, we allow
            # for other dtypes, matching our non-pyarrow behavior

            if pa.types.is_duration(pa_type):
                data_to_cmp = self._data.cast(pa.int64())
            else:
                data_to_cmp = self._data

            not_eq = pc.not_equal(data_to_cmp, 0)
            data_to_reduce = not_eq

        elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            data_to_reduce = self._data.cast(pa.int64())

        elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data_to_reduce = self._data.cast(pa.int32())
            else:
                data_to_reduce = self._data.cast(pa.int64())

        if name == "sem":

            def pyarrow_meth(data, skip_nulls, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
                denominator = pc.sqrt_checked(pc.count(self._data))
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "quantile",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)

        # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
        if name in ["any", "all"] and "min_count" not in kwargs:
            kwargs["min_count"] = 0
        elif name == "median":
            # GH 52679: Use quantile instead of approximate_median
            kwargs["q"] = 0.5

        try:
            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if name == "median":
            # GH 52679: Use quantile instead of approximate_median; returns array
            result = result[0]
        if pc.is_null(result).as_py():
            return self.dtype.na_value

        if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            result = result.cast(pa_type)
        if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa_type)
        if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa.int64())
            if pa.types.is_duration(pa_type):
                result = result.cast(pa_type)
            elif pa.types.is_time(pa_type):
                unit = get_unit_from_pa_dtype(pa_type)
                result = result.cast(pa.duration(unit))
            elif pa.types.is_date(pa_type):
                # go with closest available unit, i.e. "s"
                result = result.cast(pa.duration("s"))
            else:
                # i.e. timestamp
                result = result.cast(pa.duration(pa_type.unit))

        return result.as_py()
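
    # Illustrative sketch (comments only): reduction names resolve to pyarrow
    # compute kernels ("std" -> pc.stddev, "median" -> pc.quantile with q=0.5),
    # so e.g.
    #   >>> pd.Series([1, 2, None], dtype="int64[pyarrow]").sum()
    #   3
    # with skipna handled by pyarrow's ``skip_nulls`` option.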

    def __setitem__(self, key, value) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None
        """
        # GH50085: unwrap 1D indexers
        if isinstance(key, tuple) and len(key) == 1:
            key = key[0]

        key = check_array_indexer(self, key)
        value = self._maybe_convert_setitem_value(value)

        if com.is_null_slice(key):
            # fast path (GH50248)
            data = self._if_else(True, value, self._data)

        elif is_integer(key):
            # fast path
            key = cast(int, key)
            n = len(self)
            if key < 0:
                key += n
            if not 0 <= key < n:
                raise IndexError(
                    f"index {key} is out of bounds for axis 0 with size {n}"
                )
            if is_list_like(value):
                raise ValueError("Length of indexer and values mismatch")
            elif isinstance(value, pa.Scalar):
                value = value.as_py()
            chunks = [
                *self._data[:key].chunks,
                pa.array([value], type=self._data.type, from_pandas=True),
                *self._data[key + 1 :].chunks,
            ]
            data = pa.chunked_array(chunks).combine_chunks()

        elif is_bool_dtype(key):
            key = np.asarray(key, dtype=np.bool_)
            data = self._replace_with_mask(self._data, key, value)

        elif is_scalar(value) or isinstance(value, pa.Scalar):
            mask = np.zeros(len(self), dtype=np.bool_)
            mask[key] = True
            data = self._if_else(mask, value, self._data)

        else:
            indices = np.arange(len(self))[key]
            if len(indices) != len(value):
                raise ValueError("Length of indexer and values mismatch")
            if len(indices) == 0:
                return
            argsort = np.argsort(indices)
            indices = indices[argsort]
            value = value.take(argsort)
            mask = np.zeros(len(self), dtype=np.bool_)
            mask[indices] = True
            data = self._replace_with_mask(self._data, mask, value)

        if isinstance(data, pa.Array):
            data = pa.chunked_array([data])
        self._data = data

    def _rank(
        self,
        *,
        axis: AxisInt = 0,
        method: str = "average",
        na_option: str = "keep",
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        if pa_version_under9p0 or axis != 0:
            ranked = super()._rank(
                axis=axis,
                method=method,
                na_option=na_option,
                ascending=ascending,
                pct=pct,
            )
            # keep dtypes consistent with the implementation below
            if method == "average" or pct:
                pa_type = pa.float64()
            else:
                pa_type = pa.uint64()
            result = pa.array(ranked, type=pa_type, from_pandas=True)
            return type(self)(result)

        data = self._data.combine_chunks()
        sort_keys = "ascending" if ascending else "descending"
        null_placement = "at_start" if na_option == "top" else "at_end"
        tiebreaker = "min" if method == "average" else method

        result = pc.rank(
            data,
            sort_keys=sort_keys,
            null_placement=null_placement,
            tiebreaker=tiebreaker,
        )

        if na_option == "keep":
            mask = pc.is_null(self._data)
            null = pa.scalar(None, type=result.type)
            result = pc.if_else(mask, null, result)

        if method == "average":
            result_max = pc.rank(
                data,
                sort_keys=sort_keys,
                null_placement=null_placement,
                tiebreaker="max",
            )
            result_max = result_max.cast(pa.float64())
            result_min = result.cast(pa.float64())
            result = pc.divide(pc.add(result_min, result_max), 2)

        if pct:
            if not pa.types.is_floating(result.type):
                result = result.cast(pa.float64())
            if method == "dense":
                divisor = pc.max(result)
            else:
                divisor = pc.count(result)
            result = pc.divide(result, divisor)

        return type(self)(result)

    def _quantile(
        self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
    ) -> ArrowExtensionArrayT:
        """
        Compute the quantiles of self for each quantile in `qs`.

        Parameters
        ----------
        qs : np.ndarray[float64]
        interpolation: str

        Returns
        -------
        same type as self
        """
        pa_dtype = self._data.type

        data = self._data
        if pa.types.is_temporal(pa_dtype):
            # https://github.com/apache/arrow/issues/33769 in these cases
            # we can cast to ints and back
            nbits = pa_dtype.bit_width
            if nbits == 32:
                data = data.cast(pa.int32())
            else:
                data = data.cast(pa.int64())

        result = pc.quantile(data, q=qs, interpolation=interpolation)

        if pa.types.is_temporal(pa_dtype):
            nbits = pa_dtype.bit_width
            if nbits == 32:
                result = result.cast(pa.int32())
            else:
                result = result.cast(pa.int64())
            result = result.cast(pa_dtype)

        return type(self)(result)

    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        pa_type = self._data.type
        if pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data = self._data.cast(pa.int32())
            elif nbits == 64:
                data = self._data.cast(pa.int64())
            else:
                raise NotImplementedError(pa_type)
        else:
            data = self._data

        if dropna:
            data = data.drop_null()

        res = pc.value_counts(data)
        most_common = res.field("values").filter(
            pc.equal(res.field("counts"), pc.max(res.field("counts")))
        )

        if pa.types.is_temporal(pa_type):
            most_common = most_common.cast(pa_type)

        return type(self)(most_common)

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if value is None:
            return value
        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
            return value
        if is_list_like(value):
            pa_box = pa.array
        else:
            pa_box = pa.scalar
        try:
            value = pa_box(value, type=self._data.type, from_pandas=True)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err
        return value

    @classmethod
    def _if_else(
        cls,
        cond: npt.NDArray[np.bool_] | bool,
        left: ArrayLike | Scalar,
        right: ArrayLike | Scalar,
    ):
        """
        Choose values based on a condition.

        Analogous to pyarrow.compute.if_else, with logic
        to fallback to numpy for unsupported types.

        Parameters
        ----------
        cond : npt.NDArray[np.bool_] or bool
        left : ArrayLike | Scalar
        right : ArrayLike | Scalar

        Returns
        -------
        pa.Array
        """
        try:
            return pc.if_else(cond, left, right)
        except pa.ArrowNotImplementedError:
            pass

        def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
            if isinstance(value, (pa.Array, pa.ChunkedArray)):
                pa_type = value.type
            elif isinstance(value, pa.Scalar):
                pa_type = value.type
                value = value.as_py()
            else:
                pa_type = None
            return np.array(value, dtype=object), pa_type

        left, left_type = _to_numpy_and_type(left)
        right, right_type = _to_numpy_and_type(right)
        pa_type = left_type or right_type
        result = np.where(cond, left, right)
        return pa.array(result, type=pa_type, from_pandas=True)
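
    # Illustrative note (comments only): the numpy fallback above handles types
    # for which ``pc.if_else`` has no kernel; values are round-tripped through
    # an object ndarray and rebuilt with ``pa.array(..., from_pandas=True)``,
    # preserving the original pyarrow type when one of the operands carried it.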

    @classmethod
    def _replace_with_mask(
        cls,
        values: pa.Array | pa.ChunkedArray,
        mask: npt.NDArray[np.bool_] | bool,
        replacements: ArrayLike | Scalar,
    ):
        """
        Replace items selected with a mask.

        Analogous to pyarrow.compute.replace_with_mask, with logic
        to fallback to numpy for unsupported types.

        Parameters
        ----------
        values : pa.Array or pa.ChunkedArray
        mask : npt.NDArray[np.bool_] or bool
        replacements : ArrayLike or Scalar
            Replacement value(s)

        Returns
        -------
        pa.Array or pa.ChunkedArray
        """
        if isinstance(replacements, pa.ChunkedArray):
            # replacements must be array or scalar, not ChunkedArray
            replacements = replacements.combine_chunks()
        if pa_version_under8p0:
            # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
            # version <= 7: segfaults with various types
            # version <= 6: fails to replace nulls
            if isinstance(replacements, pa.Array):
                indices = np.full(len(values), None)
                indices[mask] = np.arange(len(replacements))
                indices = pa.array(indices, type=pa.int64())
                replacements = replacements.take(indices)
            return cls._if_else(mask, replacements, values)
        if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
            # GH#52059 replace_with_mask segfaults for chunked array
            # https://github.com/apache/arrow/issues/34634
            values = values.combine_chunks()
        try:
            return pc.replace_with_mask(values, mask, replacements)
        except pa.ArrowNotImplementedError:
            pass
        if isinstance(replacements, pa.Array):
            replacements = np.array(replacements, dtype=object)
        elif isinstance(replacements, pa.Scalar):
            replacements = replacements.as_py()
        result = np.array(values, dtype=object)
        result[mask] = replacements
        return pa.array(result, type=values.type, from_pandas=True)

    def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
        """Apply a callable to each element while maintaining the chunking structure."""
        return [
            [
                None if val is None else func(val)
                for val in chunk.to_numpy(zero_copy_only=False)
            ]
            for chunk in self._data.iterchunks()
        ]

    def _str_count(self, pat: str, flags: int = 0):
        if flags:
            raise NotImplementedError(f"count not implemented with {flags=}")
        return type(self)(pc.count_substring_regex(self._data, pat))

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            pa_pad = pc.utf8_center
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._data, width=width, padding=fillchar))

    def _str_contains(
        self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
    ):
        if flags:
            raise NotImplementedError(f"contains not implemented with {flags=}")

        if regex:
            pa_contains = pc.match_substring_regex
        else:
            pa_contains = pc.match_substring
        result = pa_contains(self._data, pat, ignore_case=not case)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_startswith(self, pat: str, na=None):
        result = pc.starts_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_endswith(self, pat: str, na=None):
        result = pc.ends_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            raise NotImplementedError(
                "replace is not supported with a re.Pattern, callable repl, "
                "case=False, or flags!=0"
            )

        func = pc.replace_substring_regex if regex else pc.replace_substring
        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
        return type(self)(result)

    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            raise NotImplementedError(
                f"repeat is not implemented when repeats is {type(repeats).__name__}"
            )
        elif pa_version_under7p0:
            raise NotImplementedError("repeat is not implemented for pyarrow < 7")
        else:
            return type(self)(pc.binary_repeat(self._data, repeats))

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.startswith("^"):
            pat = f"^{pat}"
        return self._str_contains(pat, case, flags, na, regex=True)

    def _str_fullmatch(
        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.endswith("$") or pat.endswith("//$"):
            pat = f"{pat}$"
        return self._str_match(pat, case, flags, na)

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if start != 0 and end is not None:
            slices = pc.utf8_slice_codeunits(self._data, start, stop=end)
            result = pc.find_substring(slices, sub)
            not_found = pc.equal(result, -1)
            # shift hits back to positions in the full string; misses stay -1
            offset_result = pc.add(result, start)
            result = pc.if_else(not_found, result, offset_result)
        elif start == 0 and end is None:
            slices = self._data
            result = pc.find_substring(slices, sub)
        else:
            raise NotImplementedError(
                f"find not implemented with {sub=}, {start=}, {end=}"
            )
        return type(self)(result)

    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._data)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._data, start=start, stop=stop, step=step
        )
        result = pa.array([None] * self._data.length(), type=self._data.type)
        result = pc.if_else(not_out_of_bounds, selected, result)
        return type(self)(result)

    def _str_join(self, sep: str):
        return type(self)(pc.binary_join(self._data, sep))

    def _str_partition(self, sep: str, expand: bool):
        predicate = lambda val: val.partition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rpartition(self, sep: str, expand: bool):
        predicate = lambda val: val.rpartition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_slice(
        self, start: int | None = None, stop: int | None = None, step: int | None = None
    ):
        if start is None:
            start = 0
        if step is None:
            step = 1
        return type(self)(
            pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step)
        )

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl))

    def _str_isalnum(self):
        return type(self)(pc.utf8_is_alnum(self._data))

    def _str_isalpha(self):
        return type(self)(pc.utf8_is_alpha(self._data))

    def _str_isdecimal(self):
        return type(self)(pc.utf8_is_decimal(self._data))

    def _str_isdigit(self):
        return type(self)(pc.utf8_is_digit(self._data))

    def _str_islower(self):
        return type(self)(pc.utf8_is_lower(self._data))

    def _str_isnumeric(self):
        return type(self)(pc.utf8_is_numeric(self._data))

    def _str_isspace(self):
        return type(self)(pc.utf8_is_space(self._data))

    def _str_istitle(self):
        return type(self)(pc.utf8_is_title(self._data))

    def _str_capitalize(self):
        return type(self)(pc.utf8_capitalize(self._data))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._data))

    def _str_isupper(self):
        return type(self)(pc.utf8_is_upper(self._data))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._data))

    def _str_len(self):
        return type(self)(pc.utf8_length(self._data))

    def _str_lower(self):
        return type(self)(pc.utf8_lower(self._data))

    def _str_upper(self):
        return type(self)(pc.utf8_upper(self._data))

    def _str_strip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_trim_whitespace(self._data)
        else:
            result = pc.utf8_trim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_lstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_ltrim_whitespace(self._data)
        else:
            result = pc.utf8_ltrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_rstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_rtrim_whitespace(self._data)
        else:
            result = pc.utf8_rtrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_removeprefix(self, prefix: str):
        # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
        # starts_with = pc.starts_with(self._data, pattern=prefix)
        # removed = pc.utf8_slice_codeunits(self._data, len(prefix))
        # result = pc.if_else(starts_with, removed, self._data)
        # return type(self)(result)
        if sys.version_info < (3, 9):
            # NOTE pyupgrade will remove this when we run it with --py39-plus
            # so don't remove the unnecessary `else` statement below
            from pandas.util._str_methods import removeprefix
1935
1936 predicate = functools.partial(removeprefix, prefix=prefix)
1937 else:
1938 predicate = lambda val: val.removeprefix(prefix)
1939 result = self._apply_elementwise(predicate)
1940 return type(self)(pa.chunked_array(result))
1941
1942 def _str_removesuffix(self, suffix: str):
1943 ends_with = pc.ends_with(self._data, pattern=suffix)
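        # NOTE: assumes a non-empty ``suffix``; -len("") == 0 would make
        # ``stop=0`` slice everything away instead of being a no-op.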
1944 removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix))
1945 result = pc.if_else(ends_with, removed, self._data)
1946 return type(self)(result)
1947
1948 def _str_casefold(self):
1949 predicate = lambda val: val.casefold()
1950 result = self._apply_elementwise(predicate)
1951 return type(self)(pa.chunked_array(result))
1952
1953 def _str_encode(self, encoding: str, errors: str = "strict"):
1954 predicate = lambda val: val.encode(encoding, errors)
1955 result = self._apply_elementwise(predicate)
1956 return type(self)(pa.chunked_array(result))
1957
1958 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
1959 raise NotImplementedError(
1960 "str.extract not supported with pd.ArrowDtype(pa.string())."
1961 )
1962
1963 def _str_findall(self, pat: str, flags: int = 0):
1964 regex = re.compile(pat, flags=flags)
1965 predicate = lambda val: regex.findall(val)
1966 result = self._apply_elementwise(predicate)
1967 return type(self)(pa.chunked_array(result))
1968
1969 def _str_get_dummies(self, sep: str = "|"):
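        # Build one boolean row per element: split each string on ``sep``,
        # collect the globally sorted vocabulary, then mark which vocabulary
        # entries occur in each element's token set.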
1970 split = pc.split_pattern(self._data, sep).combine_chunks()
1971 uniques = split.flatten().unique()
        uniques_sorted = uniques.take(pc.array_sort_indices(uniques))
1973 result_data = []
1974 for lst in split.to_pylist():
1975 if lst is None:
1976 result_data.append([False] * len(uniques_sorted))
1977 else:
1978 res = pc.is_in(uniques_sorted, pa.array(set(lst)))
1979 result_data.append(res.to_pylist())
1980 result = type(self)(pa.array(result_data))
1981 return result, uniques_sorted.to_pylist()
1982
1983 def _str_index(self, sub: str, start: int = 0, end: int | None = None):
1984 predicate = lambda val: val.index(sub, start, end)
1985 result = self._apply_elementwise(predicate)
1986 return type(self)(pa.chunked_array(result))
1987
1988 def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
1989 predicate = lambda val: val.rindex(sub, start, end)
1990 result = self._apply_elementwise(predicate)
1991 return type(self)(pa.chunked_array(result))
1992
1993 def _str_normalize(self, form: str):
1994 predicate = lambda val: unicodedata.normalize(form, val)
1995 result = self._apply_elementwise(predicate)
1996 return type(self)(pa.chunked_array(result))
1997
    def _str_rfind(self, sub: str, start: int = 0, end: int | None = None):
1999 predicate = lambda val: val.rfind(sub, start, end)
2000 result = self._apply_elementwise(predicate)
2001 return type(self)(pa.chunked_array(result))
2002
2003 def _str_split(
2004 self,
2005 pat: str | None = None,
2006 n: int | None = -1,
2007 expand: bool = False,
2008 regex: bool | None = None,
2009 ):
        if n in {-1, 0}:
            n = None
        if pat is None:
            # mirror str.split(): a missing pattern splits on runs of whitespace
            return type(self)(pc.utf8_split_whitespace(self._data, max_splits=n))
        if regex:
            split_func = pc.split_pattern_regex
        else:
            split_func = pc.split_pattern
        return type(self)(split_func(self._data, pat, max_splits=n))
2017
2018 def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
        if n in {-1, 0}:
            n = None
        if pat is None:
            return type(self)(
                pc.utf8_split_whitespace(self._data, max_splits=n, reverse=True)
            )
        return type(self)(pc.split_pattern(self._data, pat, max_splits=n, reverse=True))
2022
2023 def _str_translate(self, table: dict[int, str]):
2024 predicate = lambda val: val.translate(table)
2025 result = self._apply_elementwise(predicate)
2026 return type(self)(pa.chunked_array(result))
2027
2028 def _str_wrap(self, width: int, **kwargs):
2029 kwargs["width"] = width
2030 tw = textwrap.TextWrapper(**kwargs)
2031 predicate = lambda val: "\n".join(tw.wrap(val))
2032 result = self._apply_elementwise(predicate)
2033 return type(self)(pa.chunked_array(result))
2034
2035 @property
2036 def _dt_year(self):
2037 return type(self)(pc.year(self._data))
2038
2039 @property
2040 def _dt_day(self):
2041 return type(self)(pc.day(self._data))
2042
2043 @property
2044 def _dt_day_of_week(self):
2045 return type(self)(pc.day_of_week(self._data))
2046
2047 _dt_dayofweek = _dt_day_of_week
2048 _dt_weekday = _dt_day_of_week
2049
2050 @property
2051 def _dt_day_of_year(self):
2052 return type(self)(pc.day_of_year(self._data))
2053
2054 _dt_dayofyear = _dt_day_of_year
2055
2056 @property
2057 def _dt_hour(self):
2058 return type(self)(pc.hour(self._data))
2059
2060 def _dt_isocalendar(self):
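        # pc.iso_calendar returns a struct array with fields
        # iso_year, iso_week and iso_day_of_week.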
2061 return type(self)(pc.iso_calendar(self._data))
2062
2063 @property
2064 def _dt_is_leap_year(self):
2065 return type(self)(pc.is_leap_year(self._data))
2066
2067 @property
2068 def _dt_microsecond(self):
2069 return type(self)(pc.microsecond(self._data))
2070
2071 @property
2072 def _dt_minute(self):
2073 return type(self)(pc.minute(self._data))
2074
2075 @property
2076 def _dt_month(self):
2077 return type(self)(pc.month(self._data))
2078
2079 @property
2080 def _dt_nanosecond(self):
2081 return type(self)(pc.nanosecond(self._data))
2082
2083 @property
2084 def _dt_quarter(self):
2085 return type(self)(pc.quarter(self._data))
2086
2087 @property
2088 def _dt_second(self):
2089 return type(self)(pc.second(self._data))
2090
2091 @property
2092 def _dt_date(self):
2093 return type(self)(self._data.cast(pa.date32()))
2094
2095 @property
2096 def _dt_time(self):
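        # pa.time64 only supports "us" and "ns"; coarser units ("s", "ms")
        # are widened to "ns" before casting.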
2097 unit = (
2098 self.dtype.pyarrow_dtype.unit
2099 if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
2100 else "ns"
2101 )
2102 return type(self)(self._data.cast(pa.time64(unit)))
2103
2104 @property
2105 def _dt_tz(self):
2106 return self.dtype.pyarrow_dtype.tz
2107
2108 def _dt_strftime(self, format: str):
2109 return type(self)(pc.strftime(self._data, format=format))
2110
2111 def _round_temporally(
2112 self,
2113 method: Literal["ceil", "floor", "round"],
2114 freq,
2115 ambiguous: TimeAmbiguous = "raise",
2116 nonexistent: TimeNonexistent = "raise",
2117 ):
2118 if ambiguous != "raise":
2119 raise NotImplementedError("ambiguous is not supported.")
2120 if nonexistent != "raise":
2121 raise NotImplementedError("nonexistent is not supported.")
2122 offset = to_offset(freq)
2123 if offset is None:
2124 raise ValueError(f"Must specify a valid frequency: {freq}")
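        # Map pandas offset prefixes onto the ``unit`` names accepted by
        # pc.{ceil,floor,round}_temporal; e.g. freq "15T" becomes
        # multiple=15, unit="minute".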
2125 pa_supported_unit = {
2126 "A": "year",
2127 "AS": "year",
2128 "Q": "quarter",
2129 "QS": "quarter",
2130 "M": "month",
2131 "MS": "month",
2132 "W": "week",
2133 "D": "day",
2134 "H": "hour",
2135 "T": "minute",
2136 "S": "second",
2137 "L": "millisecond",
2138 "U": "microsecond",
2139 "N": "nanosecond",
2140 }
2141 unit = pa_supported_unit.get(offset._prefix, None)
2142 if unit is None:
2143 raise ValueError(f"{freq=} is not supported")
2144 multiple = offset.n
2145 rounding_method = getattr(pc, f"{method}_temporal")
2146 return type(self)(rounding_method(self._data, multiple=multiple, unit=unit))
2147
2148 def _dt_ceil(
2149 self,
2150 freq,
2151 ambiguous: TimeAmbiguous = "raise",
2152 nonexistent: TimeNonexistent = "raise",
2153 ):
2154 return self._round_temporally("ceil", freq, ambiguous, nonexistent)
2155
2156 def _dt_floor(
2157 self,
2158 freq,
2159 ambiguous: TimeAmbiguous = "raise",
2160 nonexistent: TimeNonexistent = "raise",
2161 ):
2162 return self._round_temporally("floor", freq, ambiguous, nonexistent)
2163
2164 def _dt_round(
2165 self,
2166 freq,
2167 ambiguous: TimeAmbiguous = "raise",
2168 nonexistent: TimeNonexistent = "raise",
2169 ):
2170 return self._round_temporally("round", freq, ambiguous, nonexistent)
2171
2172 def _dt_to_pydatetime(self):
2173 if pa.types.is_date(self.dtype.pyarrow_dtype):
2174 raise ValueError(
2175 f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
2176 "Convert to pyarrow timestamp type."
2177 )
2178 data = self._data.to_pylist()
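        # With unit "ns" pyarrow hands back pd.Timestamp objects (hence the
        # ``to_pydatetime`` call below); other units already give stdlib
        # datetimes.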
        if self.dtype.pyarrow_dtype.unit == "ns":
2180 data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
2181 return np.array(data, dtype=object)
2182
2183 def _dt_tz_localize(
2184 self,
2185 tz,
2186 ambiguous: TimeAmbiguous = "raise",
2187 nonexistent: TimeNonexistent = "raise",
2188 ):
2189 if ambiguous != "raise":
2190 raise NotImplementedError(f"{ambiguous=} is not supported")
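        # Translate pandas' ``nonexistent`` vocabulary into the names
        # pc.assume_timezone understands; anything unmapped is unsupported.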
2191 nonexistent_pa = {
2192 "raise": "raise",
2193 "shift_backward": "earliest",
2194 "shift_forward": "latest",
2195 }.get(
2196 nonexistent, None # type: ignore[arg-type]
2197 )
2198 if nonexistent_pa is None:
2199 raise NotImplementedError(f"{nonexistent=} is not supported")
2200 if tz is None:
2201 result = self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
2202 else:
2203 result = pc.assume_timezone(
2204 self._data, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
2205 )
2206 return type(self)(result)