from __future__ import annotations

from copy import deepcopy
import functools
import operator
import re
import sys
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Sequence,
    TypeVar,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._typing import (
    ArrayLike,
    AxisInt,
    Dtype,
    FillnaOptions,
    Iterator,
    NpDtype,
    PositionalIndexer,
    Scalar,
    SortKind,
    TakeIndexer,
    TimeAmbiguous,
    TimeNonexistent,
    npt,
)
from pandas.compat import (
    pa_version_under7p0,
    pa_version_under8p0,
    pa_version_under9p0,
    pa_version_under11p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import roperator
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.tseries.frequencies import to_offset

if not pa_version_under7p0:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
    from pandas.core.arrays.arrow.dtype import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> pa.ChunkedArray:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            return arrow_array.cast(pa.float64())
        return arrow_array

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # Ensure int // int -> int mirroring Python/Numpy behavior
        # as pc.floor(pc.divide_checked(int, int)) -> float
        result = pc.floor(pc.divide(left, right))
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            result = result.cast(left.type)
        return result
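
    # Illustrative sketch (comments only, not executed at import; ``int64_arr``
    # and ``two`` are hypothetical names): with integer operands,
    # ``cast_for_truediv`` upcasts so division yields floats, while
    # ``floordiv_compat`` floors and casts back so the integer type survives:
    #   pc.divide(cast_for_truediv(int64_arr, two), two)  -> float64 values
    #   floordiv_compat(int64_arr, two)                   -> int64 values
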
121
122 ARROW_ARITHMETIC_FUNCS = {
123 "add": pc.add_checked,
124 "radd": lambda x, y: pc.add_checked(y, x),
125 "sub": pc.subtract_checked,
126 "rsub": lambda x, y: pc.subtract_checked(y, x),
127 "mul": pc.multiply_checked,
128 "rmul": lambda x, y: pc.multiply_checked(y, x),
129 "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y),
130 "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)),
131 "floordiv": lambda x, y: floordiv_compat(x, y),
132 "rfloordiv": lambda x, y: floordiv_compat(y, x),
133 "mod": NotImplemented,
134 "rmod": NotImplemented,
135 "divmod": NotImplemented,
136 "rdivmod": NotImplemented,
137 "pow": pc.power_checked,
138 "rpow": lambda x, y: pc.power_checked(y, x),
139 }
140
141if TYPE_CHECKING:
142 from pandas._typing import (
143 NumpySorter,
144 NumpyValueArrayLike,
145 )
146
147 from pandas import Series
148
149ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray")
150
151
152def get_unit_from_pa_dtype(pa_dtype):
153 # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
154 if pa_version_under11p0:
155 unit = str(pa_dtype).split("[", 1)[-1][:-1]
156 if unit not in ["s", "ms", "us", "ns"]:
157 raise ValueError(pa_dtype)
158 return unit
159 return pa_dtype.unit
160
161
162def to_pyarrow_type(
163 dtype: ArrowDtype | pa.DataType | Dtype | None,
164) -> pa.DataType | None:
165 """
166 Convert dtype to a pyarrow type instance.
167 """
168 if isinstance(dtype, ArrowDtype):
169 return dtype.pyarrow_dtype
170 elif isinstance(dtype, pa.DataType):
171 return dtype
172 elif isinstance(dtype, DatetimeTZDtype):
173 return pa.timestamp(dtype.unit, dtype.tz)
174 elif dtype:
175 try:
176 # Accepts python types too
177 # Doesn't handle all numpy types
178 return pa.from_numpy_dtype(dtype)
179 except pa.ArrowNotImplementedError:
180 pass
181 return None
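
# Illustrative sketch (comments only, not executed at import; assumes pyarrow
# is installed): each branch above maps a different dtype flavor to the same
# pyarrow type, e.g.
#   to_pyarrow_type(pa.int64())                   -> int64
#   to_pyarrow_type(np.dtype("int64"))            -> int64
#   to_pyarrow_type(DatetimeTZDtype("ns", "UTC")) -> timestamp[ns, tz=UTC]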


class ArrowExtensionArray(
    OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods
):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__.
    Some methods may either raise an exception or emit a ``PerformanceWarning`` if an
    associated compute function is not available based on the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    _data: pa.ChunkedArray
    _dtype: ArrowDtype

    def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
        if pa_version_under7p0:
            msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
            raise ImportError(msg)
        if isinstance(values, pa.Array):
            self._data = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._data = values
        else:
            raise ValueError(
                f"Unsupported type '{type(values)}' for ArrowExtensionArray"
            )
        self._dtype = ArrowDtype(self._data.type)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Construct a new ExtensionArray from a sequence of scalars.
        """
        pa_dtype = to_pyarrow_type(dtype)
        if (
            isinstance(scalars, np.ndarray)
            and isinstance(dtype, ArrowDtype)
            and (
                pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype)
            )
        ):
            # See https://github.com/apache/arrow/issues/35289
            scalars = scalars.tolist()

        if isinstance(scalars, cls):
            scalars = scalars._data
        elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)):
            if copy and is_array_like(scalars):
                # pa array should not get updated when numpy array is updated
                scalars = deepcopy(scalars)
            try:
                scalars = pa.array(scalars, type=pa_dtype, from_pandas=True)
            except pa.ArrowInvalid:
                # GH50430: let pyarrow infer type, then cast
                scalars = pa.array(scalars, from_pandas=True)
                if pa_dtype:
                    if pa.types.is_dictionary(pa_dtype):
                        scalars = scalars.dictionary_encode()
                    else:
                        scalars = scalars.cast(pa_dtype)
        arr = cls(scalars)
        if pa.types.is_duration(scalars.type) and scalars.null_count > 0:
            # GH52843: upstream bug for duration types when originally
            # constructed with data containing numpy NaT.
            # https://github.com/apache/arrow/issues/35088
            arr = arr.fillna(arr.dtype.na_value)
        return arr
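
    # Illustrative sketch (comments only): this constructor path is what
    # ``pd.array(..., dtype="int64[pyarrow]")`` ultimately goes through, e.g.
    #   ArrowExtensionArray._from_sequence([1, None], dtype="int64[pyarrow]")
    # boxes the scalars with ``pa.array(..., from_pandas=True)`` so ``None``
    # becomes a pyarrow null rather than a float NaN.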

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.
        """
        pa_type = to_pyarrow_type(dtype)
        if (
            pa_type is None
            or pa.types.is_binary(pa_type)
            or pa.types.is_string(pa_type)
        ):
            # pa_type is None: Let pa.array infer
            # pa_type is string/binary: scalars already correct type
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
            if pa_type.unit != "ns":
                # GH51175: test_from_sequence_of_strings_pa_array
                # attempt to parse as int64 reflecting pyarrow's
                # duration to string casting behavior
                mask = isna(scalars)
                if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
                    strings = pa.array(strings, type=pa.string(), from_pandas=True)
                strings = pc.if_else(mask, None, strings)
                try:
                    scalars = strings.cast(pa.int64())
                except pa.ArrowInvalid:
                    pass
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            from pandas.core.arrays import BooleanArray

            scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy()
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
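
    # Illustrative sketch (comments only): each branch above reuses an existing
    # pandas parser before round-tripping through ``_from_sequence``, e.g.
    #   _from_sequence_of_strings(["1", "2"], dtype="int64[pyarrow]")
    # goes through ``to_numeric``, while a timestamp dtype goes through
    # ``to_datetime``.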

    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage == "pyarrow":
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif is_integer_dtype(item.dtype):
                return self.take(item)
            elif is_bool_dtype(item.dtype):
                return type(self)(self._data.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if item is Ellipsis:
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        value = self._data[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            else:
                return scalar
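
    # Illustrative sketch (comments only): the indexing flavors above in
    # doctest style, with outputs abbreviated:
    #   >>> arr = pd.array([1, 2, None], dtype="int64[pyarrow]")
    #   >>> arr[0]      # scalar path
    #   1
    #   >>> arr[1:]     # slice path, dispatched to pyarrow
    #   <ArrowExtensionArray> [2, <NA>] ...
    #   >>> arr[2]      # a null scalar comes back as the dtype's NA value
    #   <NA>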

    def __iter__(self) -> Iterator[Any]:
        """
        Iterate over elements of the array.
        """
        na_value = self._dtype.na_value
        for value in self._data:
            val = value.as_py()
            if val is None:
                yield na_value
            else:
                yield val

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow ChunkedArray."""
        return self._data

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.to_numpy(dtype=dtype)

    def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.invert(self._data))

    def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.negate_checked(self._data))

    def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(self._data)

    def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        return type(self)(pc.abs_checked(self._data))

    # GH 42600: __getstate__/__setstate__ not necessary once
    # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
    def __getstate__(self):
        state = self.__dict__.copy()
        state["_data"] = self._data.combine_chunks()
        return state

    def __setstate__(self, state) -> None:
        state["_data"] = pa.chunked_array(state["_data"])
        self.__dict__.update(state)

    def _cmp_method(self, other, op):
        from pandas.core.arrays.masked import BaseMaskedArray

        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, other)
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            try:
                result = pc_func(self._data, pa.scalar(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                result[valid] = op(np.array(self)[valid], other)
                result = pa.array(result, type=pa.bool_())
                result = pc.if_else(valid, result, None)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return ArrowExtensionArray(result)
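
    # Illustrative sketch (comments only): comparisons dispatch through the
    # ``ARROW_CMP_FUNCS`` table, so e.g.
    #   pd.array([1, None], dtype="int64[pyarrow]") == 1
    # runs ``pc.equal`` and propagates the null, giving a boolean[pyarrow]
    # result of [True, <NA>].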

    def _evaluate_op_method(self, other, op, arrow_funcs):
        from pandas.core.arrays.masked import BaseMaskedArray

        pa_type = self._data.type
        if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [
            operator.add,
            roperator.radd,
        ]:
            length = self._data.length()

            seps: list[str] | list[bytes]
            if pa.types.is_string(pa_type):
                seps = [""] * length
            else:
                seps = [b""] * length

            if is_scalar(other):
                other = [other] * length
            elif isinstance(other, type(self)):
                other = other._data
            if op is operator.add:
                result = pc.binary_join_element_wise(self._data, other, seps)
            else:
                result = pc.binary_join_element_wise(other, self._data, seps)
            return type(self)(result)

        pc_func = arrow_funcs[op.__name__]
        if pc_func is NotImplemented:
            raise NotImplementedError(f"{op.__name__} not implemented.")
        if isinstance(other, ArrowExtensionArray):
            result = pc_func(self._data, other._data)
        elif isinstance(other, (np.ndarray, list)):
            result = pc_func(self._data, pa.array(other, from_pandas=True))
        elif isinstance(other, BaseMaskedArray):
            # GH 52625
            result = pc_func(self._data, other.__arrow_array__())
        elif is_scalar(other):
            if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS:
                # pyarrow kleene ops require null to be typed
                pa_scalar = pa.scalar(None, type=self._data.type)
            else:
                pa_scalar = pa.scalar(other)
            result = pc_func(self._data, pa_scalar)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return type(self)(result)

    def _logical_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)

    def equals(self, other) -> bool:
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._data == other._data

    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._data.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._data)

    def __contains__(self, key) -> bool:
        # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key) and isna(key):
                return pc.any(pc.is_nan(self._data)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))
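
    # Illustrative note (comments only): ``__contains__`` special-cases NA-like
    # keys. A float NaN key is checked with ``pc.is_nan`` against float data,
    # while other NA-like keys (None, pd.NaT) return False unless they are
    # exactly ``self.dtype.na_value``; everything else defers to the base
    # ExtensionArray implementation.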

    @property
    def _hasna(self) -> bool:
        return self._data.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        return self._data.is_null().to_numpy()

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)

    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._data, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)
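
    # Illustrative sketch (comments only): ``argsort`` maps pandas options onto
    # ``pc.array_sort_indices``, e.g.
    #   >>> pd.array([2, None, 1], dtype="int64[pyarrow]").argsort()
    #   array([2, 0, 1])
    # with the null placed last by default (na_position="last").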

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._data.length() in (0, self._data.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._data
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._data)

    def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._data))

    @doc(ExtensionArray.fillna)
    def fillna(
        self: ArrowExtensionArrayT,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> ArrowExtensionArrayT:
        value, method = validate_fillna_kwargs(value, method)

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit)

        if method is not None:
            fallback_performancewarning()
            return super().fillna(value=value, method=method, limit=limit)

        if is_array_like(value):
            value = cast(ArrayLike, value)
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f"expected {len(self)}"
                )

        def convert_fill_value(value, pa_type, dtype):
            if value is None:
                return value
            if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
                return value
            if is_array_like(value):
                pa_box = pa.array
            else:
                pa_box = pa.scalar
            try:
                value = pa_box(value, type=pa_type, from_pandas=True)
            except pa.ArrowTypeError as err:
                msg = f"Invalid value '{str(value)}' for dtype {dtype}"
                raise TypeError(msg) from err
            return value

        fill_value = convert_fill_value(value, self._data.type, self.dtype)

        try:
            if method is None:
                return type(self)(pc.fill_null(self._data, fill_value=fill_value))
            elif method == "pad":
                return type(self)(pc.fill_null_forward(self._data))
            elif method == "backfill":
                return type(self)(pc.fill_null_backward(self._data))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            # matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            # a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit)

    def isin(self, values) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
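
    # Illustrative sketch (comments only):
    #   >>> pd.array([1, 2, None], dtype="int64[pyarrow]").isin([2, 3])
    #   array([False,  True, False])
    # Nulls come back False because the result is materialized with
    # ``np.array(result, dtype=np.bool_)``.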

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._data.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            pa_indices = encoded.combine_chunks().indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(encoded.chunk(0).dictionary)

        if pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques
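
    # Illustrative sketch (comments only): factorize is dictionary encoding
    # plus a null sentinel, e.g.
    #   pd.array([1, 2, 1, None], dtype="int64[pyarrow]").factorize()
    # conceptually yields codes [0, 1, 0, -1] and uniques [1, 2].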

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(
        self: ArrowExtensionArrayT, decimals: int = 0, *args, **kwargs
    ) -> ArrowExtensionArrayT:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
        """
        return type(self)(pc.round(self._data, ndigits=decimals))

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        if isinstance(value, ExtensionArray):
            value = value.astype(object)
        # Base class searchsorted would cast to object, which is *much* slower.
        return self.to_numpy().searchsorted(value, side=side, sorter=sorter)

    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if not is_array_like(indices):
            indices_array = np.asanyarray(indices)
        else:
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]

        if len(self._data) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._data):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._data))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._data.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._data.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._data)
            return type(self)(self._data.take(indices_array))

    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        if dtype is None and self._hasna:
            dtype = object
        if na_value is lib.no_default:
            na_value = self.dtype.na_value

        pa_type = self._data.type
        if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type):
            # temporal types with units and/or timezones currently
            # require pandas/python scalars to pass all tests
            # TODO: improve performance (this is slow)
            result = np.array(list(self), dtype=dtype)
        elif is_object_dtype(dtype) and self._hasna:
            result = np.empty(len(self), dtype=object)
            mask = ~self.isna()
            result[mask] = np.asarray(self[mask]._data)
        elif pa.types.is_null(self._data.type):
            result = np.asarray(self._data, dtype=dtype)
            if not isna(na_value):
                result[:] = na_value
            return result
        elif self._hasna:
            data = self.copy()
            data[self.isna()] = na_value
            return np.asarray(data._data, dtype=dtype)
        else:
            result = np.asarray(self._data, dtype=dtype)
            if copy:
                result = result.copy()
        if self._hasna:
            result[self.isna()] = na_value
        return result
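
    # Illustrative sketch (comments only): with missing values present the
    # default result dtype falls back to object so that pd.NA survives, e.g.
    #   >>> pd.array([1, None], dtype="int64[pyarrow]").to_numpy()
    #   array([1, <NA>], dtype=object)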

    def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
        """
        pa_type = self._data.type

        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        pa_result = pc.unique(data)

        if pa.types.is_duration(pa_type):
            pa_result = pa_result.cast(pa_type)

        return type(self)(pa_result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        pa_type = self._data.type
        if pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._data.cast(pa.int64())
        else:
            data = self._data

        from pandas import (
            Index,
            Series,
        )

        vc = data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and data.null_count > 0:
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        if pa.types.is_duration(pa_type):
            values = values.cast(pa_type)

        counts = ArrowExtensionArray(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index, name="count", copy=False)
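
    # Illustrative sketch (comments only, output abbreviated):
    #   >>> pd.array([1, 1, None], dtype="int64[pyarrow]").value_counts()
    #   1    2
    #   Name: count, dtype: int64[pyarrow]
    # With dropna=False the null would appear as an <NA> index entry.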

    @classmethod
    def _concat_same_type(
        cls: type[ArrowExtensionArrayT], to_concat
    ) -> ArrowExtensionArrayT:
        """
        Concatenate multiple ArrowExtensionArrays.

        Parameters
        ----------
        to_concat : sequence of ArrowExtensionArrays

        Returns
        -------
        ArrowExtensionArray
        """
        chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
        if to_concat[0].dtype == "string":
            # StringDtype has no attribute pyarrow_dtype
            pa_dtype = pa.string()
        else:
            pa_dtype = to_concat[0].dtype.pyarrow_dtype
        arr = pa.chunked_array(chunks, type=pa_dtype)
        return cls(arr)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> ArrowExtensionArray | ExtensionArray:
        """
        Return an ExtensionArray performing an accumulation operation.

        The underlying data type might change.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            - cummin
            - cummax
            - cumsum
            - cumprod
        skipna : bool, default True
            If True, skip NA values.
        **kwargs
            Additional keyword arguments passed to the accumulation function.
            Currently, there is no supported kwarg.

        Returns
        -------
        array

        Raises
        ------
        NotImplementedError : subclass does not define accumulations
        """
        pyarrow_name = {
            "cumsum": "cumulative_sum_checked",
        }.get(name, name)
        pyarrow_meth = getattr(pc, pyarrow_name, None)
        if pyarrow_meth is None:
            return super()._accumulate(name, skipna=skipna, **kwargs)

        data_to_accum = self._data

        pa_dtype = data_to_accum.type
        if pa.types.is_duration(pa_dtype):
            data_to_accum = data_to_accum.cast(pa.int64())

        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)

        if pa.types.is_duration(pa_dtype):
            result = result.cast(pa_dtype)

        return type(self)(result)

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        pa_type = self._data.type

        data_to_reduce = self._data

        if name in ["any", "all"] and (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_duration(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            # pyarrow only supports any/all for boolean dtype, we allow
            # for other dtypes, matching our non-pyarrow behavior

            if pa.types.is_duration(pa_type):
                data_to_cmp = self._data.cast(pa.int64())
            else:
                data_to_cmp = self._data

            not_eq = pc.not_equal(data_to_cmp, 0)
            data_to_reduce = not_eq

        elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            data_to_reduce = self._data.cast(pa.int64())

        elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data_to_reduce = self._data.cast(pa.int32())
            else:
                data_to_reduce = self._data.cast(pa.int64())

        if name == "sem":

            def pyarrow_meth(data, skip_nulls, **kwargs):
                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
                denominator = pc.sqrt_checked(pc.count(self._data))
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "quantile",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)

        # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
        if name in ["any", "all"] and "min_count" not in kwargs:
            kwargs["min_count"] = 0
        elif name == "median":
            # GH 52679: Use quantile instead of approximate_median
            kwargs["q"] = 0.5

        try:
            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if name == "median":
            # GH 52679: Use quantile instead of approximate_median; returns array
            result = result[0]
        if pc.is_null(result).as_py():
            return self.dtype.na_value

        if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            result = result.cast(pa_type)
        if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa_type)
        if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa.int64())
            if pa.types.is_duration(pa_type):
                result = result.cast(pa_type)
            elif pa.types.is_time(pa_type):
                unit = get_unit_from_pa_dtype(pa_type)
                result = result.cast(pa.duration(unit))
            elif pa.types.is_date(pa_type):
                # go with closest available unit, i.e. "s"
                result = result.cast(pa.duration("s"))
            else:
                # i.e. timestamp
                result = result.cast(pa.duration(pa_type.unit))

        return result.as_py()
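
    # Illustrative sketch (comments only): reduction names resolve to pyarrow
    # compute kernels ("std" -> pc.stddev, "median" -> pc.quantile with q=0.5),
    # so e.g.
    #   >>> pd.Series([1, 2, None], dtype="int64[pyarrow]").sum()
    #   3
    # with skipna handled by pyarrow's ``skip_nulls`` option.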

    def __setitem__(self, key, value) -> None:
        """Set one or more values inplace.

        Parameters
        ----------
        key : int, ndarray, or slice
            When called from, e.g. ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers.
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set of ``key``.

        Returns
        -------
        None
        """
        # GH50085: unwrap 1D indexers
        if isinstance(key, tuple) and len(key) == 1:
            key = key[0]

        key = check_array_indexer(self, key)
        value = self._maybe_convert_setitem_value(value)

        if com.is_null_slice(key):
            # fast path (GH50248)
            data = self._if_else(True, value, self._data)

        elif is_integer(key):
            # fast path
            key = cast(int, key)
            n = len(self)
            if key < 0:
                key += n
            if not 0 <= key < n:
                raise IndexError(
                    f"index {key} is out of bounds for axis 0 with size {n}"
                )
            if is_list_like(value):
                raise ValueError("Length of indexer and values mismatch")
            elif isinstance(value, pa.Scalar):
                value = value.as_py()
            chunks = [
                *self._data[:key].chunks,
                pa.array([value], type=self._data.type, from_pandas=True),
                *self._data[key + 1 :].chunks,
            ]
            data = pa.chunked_array(chunks).combine_chunks()

        elif is_bool_dtype(key):
            key = np.asarray(key, dtype=np.bool_)
            data = self._replace_with_mask(self._data, key, value)

        elif is_scalar(value) or isinstance(value, pa.Scalar):
            mask = np.zeros(len(self), dtype=np.bool_)
            mask[key] = True
            data = self._if_else(mask, value, self._data)

        else:
            indices = np.arange(len(self))[key]
            if len(indices) != len(value):
                raise ValueError("Length of indexer and values mismatch")
            if len(indices) == 0:
                return
            argsort = np.argsort(indices)
            indices = indices[argsort]
            value = value.take(argsort)
            mask = np.zeros(len(self), dtype=np.bool_)
            mask[indices] = True
            data = self._replace_with_mask(self._data, mask, value)

        if isinstance(data, pa.Array):
            data = pa.chunked_array([data])
        self._data = data

    def _rank(
        self,
        *,
        axis: AxisInt = 0,
        method: str = "average",
        na_option: str = "keep",
        ascending: bool = True,
        pct: bool = False,
    ):
        """
        See Series.rank.__doc__.
        """
        if pa_version_under9p0 or axis != 0:
            ranked = super()._rank(
                axis=axis,
                method=method,
                na_option=na_option,
                ascending=ascending,
                pct=pct,
            )
            # keep dtypes consistent with the implementation below
            if method == "average" or pct:
                pa_type = pa.float64()
            else:
                pa_type = pa.uint64()
            result = pa.array(ranked, type=pa_type, from_pandas=True)
            return type(self)(result)

        data = self._data.combine_chunks()
        sort_keys = "ascending" if ascending else "descending"
        null_placement = "at_start" if na_option == "top" else "at_end"
        tiebreaker = "min" if method == "average" else method

        result = pc.rank(
            data,
            sort_keys=sort_keys,
            null_placement=null_placement,
            tiebreaker=tiebreaker,
        )

        if na_option == "keep":
            mask = pc.is_null(self._data)
            null = pa.scalar(None, type=result.type)
            result = pc.if_else(mask, null, result)

        if method == "average":
            result_max = pc.rank(
                data,
                sort_keys=sort_keys,
                null_placement=null_placement,
                tiebreaker="max",
            )
            result_max = result_max.cast(pa.float64())
            result_min = result.cast(pa.float64())
            result = pc.divide(pc.add(result_min, result_max), 2)

        if pct:
            if not pa.types.is_floating(result.type):
                result = result.cast(pa.float64())
            if method == "dense":
                divisor = pc.max(result)
            else:
                divisor = pc.count(result)
            result = pc.divide(result, divisor)

        return type(self)(result)

    def _quantile(
        self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str
    ) -> ArrowExtensionArrayT:
        """
        Compute the quantiles of self for each quantile in `qs`.

        Parameters
        ----------
        qs : np.ndarray[float64]
        interpolation: str

        Returns
        -------
        same type as self
        """
        pa_dtype = self._data.type

        data = self._data
        if pa.types.is_temporal(pa_dtype):
            # https://github.com/apache/arrow/issues/33769 in these cases
            # we can cast to ints and back
            nbits = pa_dtype.bit_width
            if nbits == 32:
                data = data.cast(pa.int32())
            else:
                data = data.cast(pa.int64())

        result = pc.quantile(data, q=qs, interpolation=interpolation)

        if pa.types.is_temporal(pa_dtype):
            nbits = pa_dtype.bit_width
            if nbits == 32:
                result = result.cast(pa.int32())
            else:
                result = result.cast(pa.int64())
            result = result.cast(pa_dtype)

        return type(self)(result)

    def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT:
        """
        Returns the mode(s) of the ExtensionArray.

        Always returns `ExtensionArray` even if only one value.

        Parameters
        ----------
        dropna : bool, default True
            Don't consider counts of NA values.

        Returns
        -------
        same type as self
            Sorted, if possible.
        """
        pa_type = self._data.type
        if pa.types.is_temporal(pa_type):
            nbits = pa_type.bit_width
            if nbits == 32:
                data = self._data.cast(pa.int32())
            elif nbits == 64:
                data = self._data.cast(pa.int64())
            else:
                raise NotImplementedError(pa_type)
        else:
            data = self._data

        if dropna:
            data = data.drop_null()

        res = pc.value_counts(data)
        most_common = res.field("values").filter(
            pc.equal(res.field("counts"), pc.max(res.field("counts")))
        )

        if pa.types.is_temporal(pa_type):
            most_common = most_common.cast(pa_type)

        return type(self)(most_common)

    def _maybe_convert_setitem_value(self, value):
        """Maybe convert value to be pyarrow compatible."""
        if value is None:
            return value
        if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)):
            return value
        if is_list_like(value):
            pa_box = pa.array
        else:
            pa_box = pa.scalar
        try:
            value = pa_box(value, type=self._data.type, from_pandas=True)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err
        return value

    @classmethod
    def _if_else(
        cls,
        cond: npt.NDArray[np.bool_] | bool,
        left: ArrayLike | Scalar,
        right: ArrayLike | Scalar,
    ):
        """
        Choose values based on a condition.

        Analogous to pyarrow.compute.if_else, with logic
        to fallback to numpy for unsupported types.

        Parameters
        ----------
        cond : npt.NDArray[np.bool_] or bool
        left : ArrayLike | Scalar
        right : ArrayLike | Scalar

        Returns
        -------
        pa.Array
        """
        try:
            return pc.if_else(cond, left, right)
        except pa.ArrowNotImplementedError:
            pass

        def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
            if isinstance(value, (pa.Array, pa.ChunkedArray)):
                pa_type = value.type
            elif isinstance(value, pa.Scalar):
                pa_type = value.type
                value = value.as_py()
            else:
                pa_type = None
            return np.array(value, dtype=object), pa_type

        left, left_type = _to_numpy_and_type(left)
        right, right_type = _to_numpy_and_type(right)
        pa_type = left_type or right_type
        result = np.where(cond, left, right)
        return pa.array(result, type=pa_type, from_pandas=True)
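
    # Illustrative note (comments only): the numpy fallback above handles types
    # for which ``pc.if_else`` has no kernel; values are round-tripped through
    # an object ndarray and rebuilt with ``pa.array(..., from_pandas=True)``,
    # preserving the original pyarrow type when one of the operands carried it.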

    @classmethod
    def _replace_with_mask(
        cls,
        values: pa.Array | pa.ChunkedArray,
        mask: npt.NDArray[np.bool_] | bool,
        replacements: ArrayLike | Scalar,
    ):
        """
        Replace items selected with a mask.

        Analogous to pyarrow.compute.replace_with_mask, with logic
        to fallback to numpy for unsupported types.

        Parameters
        ----------
        values : pa.Array or pa.ChunkedArray
        mask : npt.NDArray[np.bool_] or bool
        replacements : ArrayLike or Scalar
            Replacement value(s)

        Returns
        -------
        pa.Array or pa.ChunkedArray
        """
        if isinstance(replacements, pa.ChunkedArray):
            # replacements must be array or scalar, not ChunkedArray
            replacements = replacements.combine_chunks()
        if pa_version_under8p0:
            # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0:
            # version <= 7: segfaults with various types
            # version <= 6: fails to replace nulls
            if isinstance(replacements, pa.Array):
                indices = np.full(len(values), None)
                indices[mask] = np.arange(len(replacements))
                indices = pa.array(indices, type=pa.int64())
                replacements = replacements.take(indices)
            return cls._if_else(mask, replacements, values)
        if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
            # GH#52059 replace_with_mask segfaults for chunked array
            # https://github.com/apache/arrow/issues/34634
            values = values.combine_chunks()
        try:
            return pc.replace_with_mask(values, mask, replacements)
        except pa.ArrowNotImplementedError:
            pass
        if isinstance(replacements, pa.Array):
            replacements = np.array(replacements, dtype=object)
        elif isinstance(replacements, pa.Scalar):
            replacements = replacements.as_py()
        result = np.array(values, dtype=object)
        result[mask] = replacements
        return pa.array(result, type=values.type, from_pandas=True)

    def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
        """Apply a callable to each element while maintaining the chunking structure."""
        return [
            [
                None if val is None else func(val)
                for val in chunk.to_numpy(zero_copy_only=False)
            ]
            for chunk in self._data.iterchunks()
        ]

    def _str_count(self, pat: str, flags: int = 0):
        if flags:
            raise NotImplementedError(f"count not implemented with {flags=}")
        return type(self)(pc.count_substring_regex(self._data, pat))

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            pa_pad = pc.utf8_center
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._data, width=width, padding=fillchar))

    def _str_contains(
        self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
    ):
        if flags:
            raise NotImplementedError(f"contains not implemented with {flags=}")

        if regex:
            pa_contains = pc.match_substring_regex
        else:
            pa_contains = pc.match_substring
        result = pa_contains(self._data, pat, ignore_case=not case)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_startswith(self, pat: str, na=None):
        result = pc.starts_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_endswith(self, pat: str, na=None):
        result = pc.ends_with(self._data, pattern=pat)
        if not isna(na):
            result = result.fill_null(na)
        return type(self)(result)

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
            raise NotImplementedError(
                "replace is not supported with a re.Pattern, callable repl, "
                "case=False, or flags!=0"
            )

        func = pc.replace_substring_regex if regex else pc.replace_substring
        result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
        return type(self)(result)

    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            raise NotImplementedError(
                f"repeat is not implemented when repeats is {type(repeats).__name__}"
            )
        elif pa_version_under7p0:
            raise NotImplementedError("repeat is not implemented for pyarrow < 7")
        else:
            return type(self)(pc.binary_repeat(self._data, repeats))

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.startswith("^"):
            pat = f"^{pat}"
        return self._str_contains(pat, case, flags, na, regex=True)

    def _str_fullmatch(
        self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        if not pat.endswith("$") or pat.endswith("//$"):
            pat = f"{pat}$"
        return self._str_match(pat, case, flags, na)

    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
        if start != 0 and end is not None:
            slices = pc.utf8_slice_codeunits(self._data, start, stop=end)
            result = pc.find_substring(slices, sub)
            not_found = pc.equal(result, -1)
            # shift hits back to positions in the full string; misses stay -1
            offset_result = pc.add(result, start)
            result = pc.if_else(not_found, result, offset_result)
        elif start == 0 and end is None:
            slices = self._data
            result = pc.find_substring(slices, sub)
        else:
            raise NotImplementedError(
                f"find not implemented with {sub=}, {start=}, {end=}"
            )
        return type(self)(result)

    def _str_get(self, i: int):
        lengths = pc.utf8_length(self._data)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            start = i
            stop = i - 1
            step = -1
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._data, start=start, stop=stop, step=step
        )
        result = pa.array([None] * self._data.length(), type=self._data.type)
        result = pc.if_else(not_out_of_bounds, selected, result)
        return type(self)(result)

    def _str_join(self, sep: str):
        return type(self)(pc.binary_join(self._data, sep))

    def _str_partition(self, sep: str, expand: bool):
        predicate = lambda val: val.partition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_rpartition(self, sep: str, expand: bool):
        predicate = lambda val: val.rpartition(sep)
        result = self._apply_elementwise(predicate)
        return type(self)(pa.chunked_array(result))

    def _str_slice(
        self, start: int | None = None, stop: int | None = None, step: int | None = None
    ):
        if start is None:
            start = 0
        if step is None:
            step = 1
        return type(self)(
            pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step)
        )

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl))

    def _str_isalnum(self):
        return type(self)(pc.utf8_is_alnum(self._data))

    def _str_isalpha(self):
        return type(self)(pc.utf8_is_alpha(self._data))

    def _str_isdecimal(self):
        return type(self)(pc.utf8_is_decimal(self._data))

    def _str_isdigit(self):
        return type(self)(pc.utf8_is_digit(self._data))

    def _str_islower(self):
        return type(self)(pc.utf8_is_lower(self._data))

    def _str_isnumeric(self):
        return type(self)(pc.utf8_is_numeric(self._data))

    def _str_isspace(self):
        return type(self)(pc.utf8_is_space(self._data))

    def _str_istitle(self):
        return type(self)(pc.utf8_is_title(self._data))

    def _str_capitalize(self):
        return type(self)(pc.utf8_capitalize(self._data))

    def _str_title(self):
        return type(self)(pc.utf8_title(self._data))

    def _str_isupper(self):
        return type(self)(pc.utf8_is_upper(self._data))

    def _str_swapcase(self):
        return type(self)(pc.utf8_swapcase(self._data))

    def _str_len(self):
        return type(self)(pc.utf8_length(self._data))

    def _str_lower(self):
        return type(self)(pc.utf8_lower(self._data))

    def _str_upper(self):
        return type(self)(pc.utf8_upper(self._data))

    def _str_strip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_trim_whitespace(self._data)
        else:
            result = pc.utf8_trim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_lstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_ltrim_whitespace(self._data)
        else:
            result = pc.utf8_ltrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_rstrip(self, to_strip=None):
        if to_strip is None:
            result = pc.utf8_rtrim_whitespace(self._data)
        else:
            result = pc.utf8_rtrim(self._data, characters=to_strip)
        return type(self)(result)

    def _str_removeprefix(self, prefix: str):
        # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed
        # starts_with = pc.starts_with(self._data, pattern=prefix)
        # removed = pc.utf8_slice_codeunits(self._data, len(prefix))
        # result = pc.if_else(starts_with, removed, self._data)
        # return type(self)(result)
        if sys.version_info < (3, 9):
            # NOTE pyupgrade will remove this when we run it with --py39-plus
            # so don't remove the unnecessary `else` statement below
            from pandas.util._str_methods import removeprefix
1935
1936 predicate = functools.partial(removeprefix, prefix=prefix)
1937 else:
1938 predicate = lambda val: val.removeprefix(prefix)
1939 result = self._apply_elementwise(predicate)
1940 return type(self)(pa.chunked_array(result))
1941
1942 def _str_removesuffix(self, suffix: str):
1943 ends_with = pc.ends_with(self._data, pattern=suffix)
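        # NOTE: assumes a non-empty ``suffix``; -len("") == 0 would make
        # ``stop=0`` slice everything away instead of being a no-op.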
1944 removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix))
1945 result = pc.if_else(ends_with, removed, self._data)
1946 return type(self)(result)
1947
1948 def _str_casefold(self):
1949 predicate = lambda val: val.casefold()
1950 result = self._apply_elementwise(predicate)
1951 return type(self)(pa.chunked_array(result))
1952
1953 def _str_encode(self, encoding: str, errors: str = "strict"):
1954 predicate = lambda val: val.encode(encoding, errors)
1955 result = self._apply_elementwise(predicate)
1956 return type(self)(pa.chunked_array(result))
1957
1958 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
1959 raise NotImplementedError(
1960 "str.extract not supported with pd.ArrowDtype(pa.string())."
1961 )
1962
1963 def _str_findall(self, pat: str, flags: int = 0):
1964 regex = re.compile(pat, flags=flags)
1965 predicate = lambda val: regex.findall(val)
1966 result = self._apply_elementwise(predicate)
1967 return type(self)(pa.chunked_array(result))
1968
1969 def _str_get_dummies(self, sep: str = "|"):
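        # Build one boolean row per element: split each string on ``sep``,
        # collect the globally sorted vocabulary, then mark which vocabulary
        # entries occur in each element's token set.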
1970 split = pc.split_pattern(self._data, sep).combine_chunks()
1971 uniques = split.flatten().unique()
        uniques_sorted = uniques.take(pc.array_sort_indices(uniques))
1973 result_data = []
1974 for lst in split.to_pylist():
1975 if lst is None:
1976 result_data.append([False] * len(uniques_sorted))
1977 else:
1978 res = pc.is_in(uniques_sorted, pa.array(set(lst)))
1979 result_data.append(res.to_pylist())
1980 result = type(self)(pa.array(result_data))
1981 return result, uniques_sorted.to_pylist()
1982
1983 def _str_index(self, sub: str, start: int = 0, end: int | None = None):
1984 predicate = lambda val: val.index(sub, start, end)
1985 result = self._apply_elementwise(predicate)
1986 return type(self)(pa.chunked_array(result))
1987
1988 def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
1989 predicate = lambda val: val.rindex(sub, start, end)
1990 result = self._apply_elementwise(predicate)
1991 return type(self)(pa.chunked_array(result))
1992
1993 def _str_normalize(self, form: str):
1994 predicate = lambda val: unicodedata.normalize(form, val)
1995 result = self._apply_elementwise(predicate)
1996 return type(self)(pa.chunked_array(result))
1997
    def _str_rfind(self, sub: str, start: int = 0, end: int | None = None):
1999 predicate = lambda val: val.rfind(sub, start, end)
2000 result = self._apply_elementwise(predicate)
2001 return type(self)(pa.chunked_array(result))
2002
2003 def _str_split(
2004 self,
2005 pat: str | None = None,
2006 n: int | None = -1,
2007 expand: bool = False,
2008 regex: bool | None = None,
2009 ):
        if n in {-1, 0}:
            n = None
        if pat is None:
            # mirror str.split(): a missing pattern splits on runs of whitespace
            return type(self)(pc.utf8_split_whitespace(self._data, max_splits=n))
        if regex:
            split_func = pc.split_pattern_regex
        else:
            split_func = pc.split_pattern
        return type(self)(split_func(self._data, pat, max_splits=n))
2017
2018 def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
        if n in {-1, 0}:
            n = None
        if pat is None:
            return type(self)(
                pc.utf8_split_whitespace(self._data, max_splits=n, reverse=True)
            )
        return type(self)(pc.split_pattern(self._data, pat, max_splits=n, reverse=True))
2022
2023 def _str_translate(self, table: dict[int, str]):
2024 predicate = lambda val: val.translate(table)
2025 result = self._apply_elementwise(predicate)
2026 return type(self)(pa.chunked_array(result))
2027
2028 def _str_wrap(self, width: int, **kwargs):
2029 kwargs["width"] = width
2030 tw = textwrap.TextWrapper(**kwargs)
2031 predicate = lambda val: "\n".join(tw.wrap(val))
2032 result = self._apply_elementwise(predicate)
2033 return type(self)(pa.chunked_array(result))
2034
2035 @property
2036 def _dt_year(self):
2037 return type(self)(pc.year(self._data))
2038
2039 @property
2040 def _dt_day(self):
2041 return type(self)(pc.day(self._data))
2042
2043 @property
2044 def _dt_day_of_week(self):
2045 return type(self)(pc.day_of_week(self._data))
2046
2047 _dt_dayofweek = _dt_day_of_week
2048 _dt_weekday = _dt_day_of_week
2049
2050 @property
2051 def _dt_day_of_year(self):
2052 return type(self)(pc.day_of_year(self._data))
2053
2054 _dt_dayofyear = _dt_day_of_year
2055
2056 @property
2057 def _dt_hour(self):
2058 return type(self)(pc.hour(self._data))
2059
2060 def _dt_isocalendar(self):
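        # pc.iso_calendar returns a struct array with fields
        # iso_year, iso_week and iso_day_of_week.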
2061 return type(self)(pc.iso_calendar(self._data))
2062
2063 @property
2064 def _dt_is_leap_year(self):
2065 return type(self)(pc.is_leap_year(self._data))
2066
2067 @property
2068 def _dt_microsecond(self):
2069 return type(self)(pc.microsecond(self._data))
2070
2071 @property
2072 def _dt_minute(self):
2073 return type(self)(pc.minute(self._data))
2074
2075 @property
2076 def _dt_month(self):
2077 return type(self)(pc.month(self._data))
2078
2079 @property
2080 def _dt_nanosecond(self):
2081 return type(self)(pc.nanosecond(self._data))
2082
2083 @property
2084 def _dt_quarter(self):
2085 return type(self)(pc.quarter(self._data))
2086
2087 @property
2088 def _dt_second(self):
2089 return type(self)(pc.second(self._data))
2090
2091 @property
2092 def _dt_date(self):
2093 return type(self)(self._data.cast(pa.date32()))
2094
2095 @property
2096 def _dt_time(self):
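        # pa.time64 only supports "us" and "ns"; coarser units ("s", "ms")
        # are widened to "ns" before casting.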
2097 unit = (
2098 self.dtype.pyarrow_dtype.unit
2099 if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
2100 else "ns"
2101 )
2102 return type(self)(self._data.cast(pa.time64(unit)))
2103
2104 @property
2105 def _dt_tz(self):
2106 return self.dtype.pyarrow_dtype.tz
2107
2108 def _dt_strftime(self, format: str):
2109 return type(self)(pc.strftime(self._data, format=format))
2110
2111 def _round_temporally(
2112 self,
2113 method: Literal["ceil", "floor", "round"],
2114 freq,
2115 ambiguous: TimeAmbiguous = "raise",
2116 nonexistent: TimeNonexistent = "raise",
2117 ):
2118 if ambiguous != "raise":
2119 raise NotImplementedError("ambiguous is not supported.")
2120 if nonexistent != "raise":
2121 raise NotImplementedError("nonexistent is not supported.")
2122 offset = to_offset(freq)
2123 if offset is None:
2124 raise ValueError(f"Must specify a valid frequency: {freq}")
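        # Map pandas offset prefixes onto the ``unit`` names accepted by
        # pc.{ceil,floor,round}_temporal; e.g. freq "15T" becomes
        # multiple=15, unit="minute".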
2125 pa_supported_unit = {
2126 "A": "year",
2127 "AS": "year",
2128 "Q": "quarter",
2129 "QS": "quarter",
2130 "M": "month",
2131 "MS": "month",
2132 "W": "week",
2133 "D": "day",
2134 "H": "hour",
2135 "T": "minute",
2136 "S": "second",
2137 "L": "millisecond",
2138 "U": "microsecond",
2139 "N": "nanosecond",
2140 }
2141 unit = pa_supported_unit.get(offset._prefix, None)
2142 if unit is None:
2143 raise ValueError(f"{freq=} is not supported")
2144 multiple = offset.n
2145 rounding_method = getattr(pc, f"{method}_temporal")
2146 return type(self)(rounding_method(self._data, multiple=multiple, unit=unit))
2147
2148 def _dt_ceil(
2149 self,
2150 freq,
2151 ambiguous: TimeAmbiguous = "raise",
2152 nonexistent: TimeNonexistent = "raise",
2153 ):
2154 return self._round_temporally("ceil", freq, ambiguous, nonexistent)
2155
2156 def _dt_floor(
2157 self,
2158 freq,
2159 ambiguous: TimeAmbiguous = "raise",
2160 nonexistent: TimeNonexistent = "raise",
2161 ):
2162 return self._round_temporally("floor", freq, ambiguous, nonexistent)
2163
2164 def _dt_round(
2165 self,
2166 freq,
2167 ambiguous: TimeAmbiguous = "raise",
2168 nonexistent: TimeNonexistent = "raise",
2169 ):
2170 return self._round_temporally("round", freq, ambiguous, nonexistent)
2171
2172 def _dt_to_pydatetime(self):
2173 if pa.types.is_date(self.dtype.pyarrow_dtype):
2174 raise ValueError(
2175 f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
2176 "Convert to pyarrow timestamp type."
2177 )
2178 data = self._data.to_pylist()
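        # With unit "ns" pyarrow hands back pd.Timestamp objects (hence the
        # ``to_pydatetime`` call below); other units already give stdlib
        # datetimes.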
        if self.dtype.pyarrow_dtype.unit == "ns":
2180 data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
2181 return np.array(data, dtype=object)
2182
2183 def _dt_tz_localize(
2184 self,
2185 tz,
2186 ambiguous: TimeAmbiguous = "raise",
2187 nonexistent: TimeNonexistent = "raise",
2188 ):
2189 if ambiguous != "raise":
2190 raise NotImplementedError(f"{ambiguous=} is not supported")
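        # Translate pandas' ``nonexistent`` vocabulary into the names
        # pc.assume_timezone understands; anything unmapped is unsupported.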
2191 nonexistent_pa = {
2192 "raise": "raise",
2193 "shift_backward": "earliest",
2194 "shift_forward": "latest",
2195 }.get(
2196 nonexistent, None # type: ignore[arg-type]
2197 )
2198 if nonexistent_pa is None:
2199 raise NotImplementedError(f"{nonexistent=} is not supported")
2200 if tz is None:
2201 result = self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
2202 else:
2203 result = pc.assume_timezone(
2204 self._data, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
2205 )
2206 return type(self)(result)