from __future__ import annotations

import functools
import operator
import re
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import (
    NaT,
    Timedelta,
    Timestamp,
    timezones,
)
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
    pa_version_under13p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import (
    can_hold_element,
    infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
    CategoricalDtype,
    is_array_like,
    is_bool_dtype,
    is_float_dtype,
    is_integer,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
    algorithms as algos,
    missing,
    ops,
    roperator,
)
from pandas.core.algorithms import map_array
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays._utils import to_numpy_dtype_inference
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.string_ import StringDtype
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.io._util import _arrow_dtype_mapping
from pandas.tseries.frequencies import to_offset

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.dtypes.dtypes import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    ARROW_BIT_WISE_FUNCS = {
        "and_": pc.bit_wise_and,
        "rand_": lambda x, y: pc.bit_wise_and(y, x),
        "or_": pc.bit_wise_or,
        "ror_": lambda x, y: pc.bit_wise_or(y, x),
        "xor": pc.bit_wise_xor,
        "rxor": lambda x, y: pc.bit_wise_xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            # GH: 56645.
            # https://github.com/apache/arrow/issues/35563
            return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
                pa_object, pa.float64(), safe=False
            )

        return arrow_array, pa_object

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # TODO: Replace with pyarrow floordiv kernel.
        # https://github.com/apache/arrow/issues/39386
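        # Why the fixup below: divide_checked truncates toward zero, while
        # Python floors, e.g. -7 // 2 == -4 but trunc(-7 / 2) == -3; with a
        # remainder and exactly one negative operand we subtract 1 to floor.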
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            divided = pc.divide_checked(left, right)
            if pa.types.is_signed_integer(divided.type):
                # GH 56676
                has_remainder = pc.not_equal(pc.multiply(divided, right), left)
                has_one_negative_operand = pc.less(
                    pc.bit_wise_xor(left, right),
                    pa.scalar(0, type=divided.type),
                )
                result = pc.if_else(
                    pc.and_(
                        has_remainder,
                        has_one_negative_operand,
                    ),
                    # GH: 55561
                    pc.subtract(divided, pa.scalar(1, type=divided.type)),
                    divided,
                )
            else:
                result = divided
            result = result.cast(left.type)
        else:
            divided = pc.divide(left, right)
            result = pc.floor(divided)
        return result

    ARROW_ARITHMETIC_FUNCS = {
        "add": pc.add_checked,
        "radd": lambda x, y: pc.add_checked(y, x),
        "sub": pc.subtract_checked,
        "rsub": lambda x, y: pc.subtract_checked(y, x),
        "mul": pc.multiply_checked,
        "rmul": lambda x, y: pc.multiply_checked(y, x),
        "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
        "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
        "floordiv": lambda x, y: floordiv_compat(x, y),
        "rfloordiv": lambda x, y: floordiv_compat(y, x),
        "mod": NotImplemented,
        "rmod": NotImplemented,
        "divmod": NotImplemented,
        "rdivmod": NotImplemented,
        "pow": pc.power_checked,
        "rpow": lambda x, y: pc.power_checked(y, x),
    }

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        AxisInt,
        Dtype,
        FillnaOptions,
        InterpolateOptions,
        Iterator,
        NpDtype,
        NumpySorter,
        NumpyValueArrayLike,
        PositionalIndexer,
        Scalar,
        Self,
        SortKind,
        TakeIndexer,
        TimeAmbiguous,
        TimeNonexistent,
        npt,
    )

    from pandas import Series
    from pandas.core.arrays.datetimes import DatetimeArray
    from pandas.core.arrays.timedeltas import TimedeltaArray


def get_unit_from_pa_dtype(pa_dtype):
    # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
    if pa_version_under11p0:
        unit = str(pa_dtype).split("[", 1)[-1][:-1]
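        # e.g. str(pa.timestamp("us")) == "timestamp[us]" -> "us"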
        if unit not in ["s", "ms", "us", "ns"]:
            raise ValueError(pa_dtype)
        return unit
    return pa_dtype.unit


def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.
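
    Examples
    --------
    >>> import pyarrow as pa
    >>> to_pyarrow_type(pd.ArrowDtype(pa.int64()))
    DataType(int64)
    >>> to_pyarrow_type(np.dtype("float64"))
    DataType(double)
    >>> to_pyarrow_type(None) is None
    True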
220 """
221 if isinstance(dtype, ArrowDtype):
222 return dtype.pyarrow_dtype
223 elif isinstance(dtype, pa.DataType):
224 return dtype
225 elif isinstance(dtype, DatetimeTZDtype):
226 return pa.timestamp(dtype.unit, dtype.tz)
227 elif dtype:
228 try:
229 # Accepts python types too
230 # Doesn't handle all numpy types
231 return pa.from_numpy_dtype(dtype)
232 except pa.ArrowNotImplementedError:
233 pass
234 return None
235
236
237class ArrowExtensionArray(
238 OpsMixin,
239 ExtensionArraySupportsAnyAll,
240 ArrowStringArrayMixin,
241 BaseStringArrayMethods,
242):
243 """
244 Pandas ExtensionArray backed by a PyArrow ChunkedArray.
245
246 .. warning::
247
248 ArrowExtensionArray is considered experimental. The implementation and
249 parts of the API may change without warning.
250
251 Parameters
252 ----------
253 values : pyarrow.Array or pyarrow.ChunkedArray
254
255 Attributes
256 ----------
257 None
258
259 Methods
260 -------
261 None
262
263 Returns
264 -------
265 ArrowExtensionArray
266
267 Notes
268 -----
269 Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
270 Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
271 associated compute function is not available based on the installed version of PyArrow.
272
273 Please install the latest version of PyArrow to enable the best functionality and avoid
274 potential bugs in prior versions of PyArrow.
275
276 Examples
277 --------
278 Create an ArrowExtensionArray with :func:`pandas.array`:
279
280 >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
281 <ArrowExtensionArray>
282 [1, 1, <NA>]
283 Length: 3, dtype: int64[pyarrow]
284 """ # noqa: E501 (http link too long)
285
286 _pa_array: pa.ChunkedArray
287 _dtype: ArrowDtype
288
289 def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
290 if pa_version_under10p1:
291 msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
292 raise ImportError(msg)
293 if isinstance(values, pa.Array):
294 self._pa_array = pa.chunked_array([values])
295 elif isinstance(values, pa.ChunkedArray):
296 self._pa_array = values
297 else:
298 raise ValueError(
299 f"Unsupported type '{type(values)}' for ArrowExtensionArray"
300 )
301 self._dtype = ArrowDtype(self._pa_array.type)
302
303 @classmethod
304 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
305 """
306 Construct a new ExtensionArray from a sequence of scalars.
307 """
308 pa_type = to_pyarrow_type(dtype)
309 pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy)
310 arr = cls(pa_array)
311 return arr
312
313 @classmethod
314 def _from_sequence_of_strings(
315 cls, strings, *, dtype: Dtype | None = None, copy: bool = False
316 ):
317 """
318 Construct a new ExtensionArray from a sequence of strings.
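
        Examples
        --------
        >>> import pyarrow as pa
        >>> pd.arrays.ArrowExtensionArray._from_sequence_of_strings(
        ...     ["1", "2", None], dtype=pd.ArrowDtype(pa.int64())
        ... )
        <ArrowExtensionArray>
        [1, 2, <NA>]
        Length: 3, dtype: int64[pyarrow]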
319 """
320 pa_type = to_pyarrow_type(dtype)
321 if (
322 pa_type is None
323 or pa.types.is_binary(pa_type)
324 or pa.types.is_string(pa_type)
325 or pa.types.is_large_string(pa_type)
326 ):
327 # pa_type is None: Let pa.array infer
328 # pa_type is string/binary: scalars already correct type
329 scalars = strings
330 elif pa.types.is_timestamp(pa_type):
331 from pandas.core.tools.datetimes import to_datetime
332
333 scalars = to_datetime(strings, errors="raise")
334 elif pa.types.is_date(pa_type):
335 from pandas.core.tools.datetimes import to_datetime
336
337 scalars = to_datetime(strings, errors="raise").date
338 elif pa.types.is_duration(pa_type):
339 from pandas.core.tools.timedeltas import to_timedelta
340
341 scalars = to_timedelta(strings, errors="raise")
342 if pa_type.unit != "ns":
343 # GH51175: test_from_sequence_of_strings_pa_array
344 # attempt to parse as int64 reflecting pyarrow's
345 # duration to string casting behavior
346 mask = isna(scalars)
347 if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
348 strings = pa.array(strings, type=pa.string(), from_pandas=True)
349 strings = pc.if_else(mask, None, strings)
350 try:
351 scalars = strings.cast(pa.int64())
352 except pa.ArrowInvalid:
353 pass
354 elif pa.types.is_time(pa_type):
355 from pandas.core.tools.times import to_time
356
357 # "coerce" to allow "null times" (None) to not raise
358 scalars = to_time(strings, errors="coerce")
359 elif pa.types.is_boolean(pa_type):
360 # pyarrow string->bool casting is case-insensitive:
361 # "true" or "1" -> True
362 # "false" or "0" -> False
363 # Note: BooleanArray was previously used to parse these strings
364 # and allows "1.0" and "0.0". Pyarrow casting does not support
365 # this, but we allow it here.
366 if isinstance(strings, (pa.Array, pa.ChunkedArray)):
367 scalars = strings
368 else:
369 scalars = pa.array(strings, type=pa.string(), from_pandas=True)
370 scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
371 scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
372 scalars = scalars.cast(pa.bool_())
373 elif (
374 pa.types.is_integer(pa_type)
375 or pa.types.is_floating(pa_type)
376 or pa.types.is_decimal(pa_type)
377 ):
378 from pandas.core.tools.numeric import to_numeric
379
380 scalars = to_numeric(strings, errors="raise")
381 else:
382 raise NotImplementedError(
383 f"Converting strings to {pa_type} is not implemented."
384 )
385 return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
386
387 @classmethod
388 def _box_pa(
389 cls, value, pa_type: pa.DataType | None = None
390 ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
391 """
392 Box value into a pyarrow Array, ChunkedArray or Scalar.
393
394 Parameters
395 ----------
396 value : any
397 pa_type : pa.DataType | None
398
399 Returns
400 -------
401 pa.Array or pa.ChunkedArray or pa.Scalar
402 """
403 if isinstance(value, pa.Scalar) or not is_list_like(value):
404 return cls._box_pa_scalar(value, pa_type)
405 return cls._box_pa_array(value, pa_type)
406
407 @classmethod
408 def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
409 """
410 Box value into a pyarrow Scalar.
411
412 Parameters
413 ----------
414 value : any
415 pa_type : pa.DataType | None
416
417 Returns
418 -------
419 pa.Scalar
420 """
421 if isinstance(value, pa.Scalar):
422 pa_scalar = value
423 elif isna(value):
424 pa_scalar = pa.scalar(None, type=pa_type)
425 else:
426 # Workaround https://github.com/apache/arrow/issues/37291
427 if isinstance(value, Timedelta):
428 if pa_type is None:
429 pa_type = pa.duration(value.unit)
430 elif value.unit != pa_type.unit:
431 value = value.as_unit(pa_type.unit)
432 value = value._value
433 elif isinstance(value, Timestamp):
434 if pa_type is None:
435 pa_type = pa.timestamp(value.unit, tz=value.tz)
436 elif value.unit != pa_type.unit:
437 value = value.as_unit(pa_type.unit)
438 value = value._value
439
440 pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
441
442 if pa_type is not None and pa_scalar.type != pa_type:
443 pa_scalar = pa_scalar.cast(pa_type)
444
445 return pa_scalar
446
447 @classmethod
448 def _box_pa_array(
449 cls, value, pa_type: pa.DataType | None = None, copy: bool = False
450 ) -> pa.Array | pa.ChunkedArray:
451 """
452 Box value into a pyarrow Array or ChunkedArray.
453
454 Parameters
455 ----------
456 value : Sequence
457 pa_type : pa.DataType | None
458
459 Returns
460 -------
461 pa.Array or pa.ChunkedArray
462 """
463 if isinstance(value, cls):
464 pa_array = value._pa_array
465 elif isinstance(value, (pa.Array, pa.ChunkedArray)):
466 pa_array = value
467 elif isinstance(value, BaseMaskedArray):
468 # GH 52625
469 if copy:
470 value = value.copy()
471 pa_array = value.__arrow_array__()
472 else:
473 if (
474 isinstance(value, np.ndarray)
475 and pa_type is not None
476 and (
477 pa.types.is_large_binary(pa_type)
478 or pa.types.is_large_string(pa_type)
479 )
480 ):
481 # See https://github.com/apache/arrow/issues/35289
482 value = value.tolist()
483 elif copy and is_array_like(value):
484 # pa array should not get updated when numpy array is updated
485 value = value.copy()
486
487 if (
488 pa_type is not None
489 and pa.types.is_duration(pa_type)
490 and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi")
491 ):
492 # Workaround https://github.com/apache/arrow/issues/37291
493 from pandas.core.tools.timedeltas import to_timedelta
494
495 value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
496 value = value.to_numpy()
497
498 try:
499 pa_array = pa.array(value, type=pa_type, from_pandas=True)
500 except (pa.ArrowInvalid, pa.ArrowTypeError):
501 # GH50430: let pyarrow infer type, then cast
502 pa_array = pa.array(value, from_pandas=True)
503
504 if pa_type is None and pa.types.is_duration(pa_array.type):
505 # Workaround https://github.com/apache/arrow/issues/37291
506 from pandas.core.tools.timedeltas import to_timedelta
507
508 value = to_timedelta(value)
509 value = value.to_numpy()
510 pa_array = pa.array(value, type=pa_type, from_pandas=True)
511
512 if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
513 # GH52843: upstream bug for duration types when originally
514 # constructed with data containing numpy NaT.
515 # https://github.com/apache/arrow/issues/35088
516 arr = cls(pa_array)
517 arr = arr.fillna(arr.dtype.na_value)
518 pa_array = arr._pa_array
519
520 if pa_type is not None and pa_array.type != pa_type:
521 if pa.types.is_dictionary(pa_type):
522 pa_array = pa_array.dictionary_encode()
523 else:
524 try:
525 pa_array = pa_array.cast(pa_type)
526 except (
527 pa.ArrowInvalid,
528 pa.ArrowTypeError,
529 pa.ArrowNotImplementedError,
530 ):
531 if pa.types.is_string(pa_array.type) or pa.types.is_large_string(
532 pa_array.type
533 ):
534 # TODO: Move logic in _from_sequence_of_strings into
535 # _box_pa_array
536 return cls._from_sequence_of_strings(
537 value, dtype=pa_type
538 )._pa_array
539 else:
540 raise
541
542 return pa_array
543
544 def __getitem__(self, item: PositionalIndexer):
545 """Select a subset of self.
546
547 Parameters
548 ----------
549 item : int, slice, or ndarray
550 * int: The position in 'self' to get.
551 * slice: A slice object, where 'start', 'stop', and 'step' are
552 integers or None
553 * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'
554
555 Returns
556 -------
557 item : scalar or ExtensionArray
558
559 Notes
560 -----
561 For scalar ``item``, return a scalar value suitable for the array's
562 type. This should be an instance of ``self.dtype.type``.
563 For slice ``key``, return an instance of ``ExtensionArray``, even
564 if the slice is length 0 or 1.
565 For a boolean mask, return an instance of ``ExtensionArray``, filtered
566 to the values where ``item`` is True.
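
        Examples
        --------
        >>> arr = pd.array([1, 2, 3], dtype="int64[pyarrow]")
        >>> arr[0]
        1
        >>> arr[1:]
        <ArrowExtensionArray>
        [2, 3]
        Length: 2, dtype: int64[pyarrow]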
567 """
568 item = check_array_indexer(self, item)
569
570 if isinstance(item, np.ndarray):
571 if not len(item):
572 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
573 if self._dtype.name == "string" and self._dtype.storage in (
574 "pyarrow",
575 "pyarrow_numpy",
576 ):
577 pa_dtype = pa.string()
578 else:
579 pa_dtype = self._dtype.pyarrow_dtype
580 return type(self)(pa.chunked_array([], type=pa_dtype))
581 elif item.dtype.kind in "iu":
582 return self.take(item)
583 elif item.dtype.kind == "b":
584 return type(self)(self._pa_array.filter(item))
585 else:
586 raise IndexError(
587 "Only integers, slices and integer or "
588 "boolean arrays are valid indices."
589 )
590 elif isinstance(item, tuple):
591 item = unpack_tuple_and_ellipses(item)
592
593 if item is Ellipsis:
594 # TODO: should be handled by pyarrow?
595 item = slice(None)
596
597 if is_scalar(item) and not is_integer(item):
598 # e.g. "foo" or 2.5
599 # exception message copied from numpy
600 raise IndexError(
601 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
602 r"(`None`) and integer or boolean arrays are valid indices"
603 )
604 # We are not an array indexer, so maybe e.g. a slice or integer
605 # indexer. We dispatch to pyarrow.
606 if isinstance(item, slice):
607 # Arrow bug https://github.com/apache/arrow/issues/38768
608 if item.start == item.stop:
609 pass
610 elif (
611 item.stop is not None
612 and item.stop < -len(self)
613 and item.step is not None
614 and item.step < 0
615 ):
616 item = slice(item.start, None, item.step)
617
618 value = self._pa_array[item]
619 if isinstance(value, pa.ChunkedArray):
620 return type(self)(value)
621 else:
622 pa_type = self._pa_array.type
623 scalar = value.as_py()
624 if scalar is None:
625 return self._dtype.na_value
626 elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns":
627 # GH 53326
628 return Timestamp(scalar).as_unit(pa_type.unit)
629 elif pa.types.is_duration(pa_type) and pa_type.unit != "ns":
630 # GH 53326
631 return Timedelta(scalar).as_unit(pa_type.unit)
632 else:
633 return scalar
634
635 def __iter__(self) -> Iterator[Any]:
636 """
637 Iterate over elements of the array.
638 """
639 na_value = self._dtype.na_value
640 # GH 53326
641 pa_type = self._pa_array.type
642 box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns"
643 box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns"
644 for value in self._pa_array:
645 val = value.as_py()
646 if val is None:
647 yield na_value
648 elif box_timestamp:
649 yield Timestamp(val).as_unit(pa_type.unit)
650 elif box_timedelta:
651 yield Timedelta(val).as_unit(pa_type.unit)
652 else:
653 yield val
654
655 def __arrow_array__(self, type=None):
656 """Convert myself to a pyarrow ChunkedArray."""
657 return self._pa_array
658
659 def __array__(
660 self, dtype: NpDtype | None = None, copy: bool | None = None
661 ) -> np.ndarray:
662 """Correctly construct numpy arrays when passed to `np.asarray()`."""
663 return self.to_numpy(dtype=dtype)
664
665 def __invert__(self) -> Self:
666 # This is a bit wise op for integer types
667 if pa.types.is_integer(self._pa_array.type):
668 return type(self)(pc.bit_wise_not(self._pa_array))
669 elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
670 self._pa_array.type
671 ):
672 # Raise TypeError instead of pa.ArrowNotImplementedError
673 raise TypeError("__invert__ is not supported for string dtypes")
674 else:
675 return type(self)(pc.invert(self._pa_array))
676
677 def __neg__(self) -> Self:
678 return type(self)(pc.negate_checked(self._pa_array))
679
680 def __pos__(self) -> Self:
681 return type(self)(self._pa_array)
682
683 def __abs__(self) -> Self:
684 return type(self)(pc.abs_checked(self._pa_array))
685
686 # GH 42600: __getstate__/__setstate__ not necessary once
687 # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
688 def __getstate__(self):
689 state = self.__dict__.copy()
690 state["_pa_array"] = self._pa_array.combine_chunks()
691 return state
692
693 def __setstate__(self, state) -> None:
694 if "_data" in state:
695 data = state.pop("_data")
696 else:
697 data = state["_pa_array"]
698 state["_pa_array"] = pa.chunked_array(data)
699 self.__dict__.update(state)
700
701 def _cmp_method(self, other, op):
702 pc_func = ARROW_CMP_FUNCS[op.__name__]
703 if isinstance(
704 other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
705 ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
706 result = pc_func(self._pa_array, self._box_pa(other))
707 elif is_scalar(other):
708 try:
709 result = pc_func(self._pa_array, self._box_pa(other))
710 except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
711 mask = isna(self) | isna(other)
712 valid = ~mask
713 result = np.zeros(len(self), dtype="bool")
714 np_array = np.array(self)
715 try:
716 result[valid] = op(np_array[valid], other)
717 except TypeError:
718 result = ops.invalid_comparison(np_array, other, op)
719 result = pa.array(result, type=pa.bool_())
720 result = pc.if_else(valid, result, None)
721 else:
722 raise NotImplementedError(
723 f"{op.__name__} not implemented for {type(other)}"
724 )
725 return ArrowExtensionArray(result)
726
727 def _evaluate_op_method(self, other, op, arrow_funcs):
728 pa_type = self._pa_array.type
729 other = self._box_pa(other)
730
731 if (
732 pa.types.is_string(pa_type)
733 or pa.types.is_large_string(pa_type)
734 or pa.types.is_binary(pa_type)
735 ):
736 if op in [operator.add, roperator.radd]:
737 sep = pa.scalar("", type=pa_type)
738 if op is operator.add:
739 result = pc.binary_join_element_wise(self._pa_array, other, sep)
740 elif op is roperator.radd:
741 result = pc.binary_join_element_wise(other, self._pa_array, sep)
742 return type(self)(result)
743 elif op in [operator.mul, roperator.rmul]:
744 binary = self._pa_array
745 integral = other
746 if not pa.types.is_integer(integral.type):
747 raise TypeError("Can only string multiply by an integer.")
748 pa_integral = pc.if_else(pc.less(integral, 0), 0, integral)
749 result = pc.binary_repeat(binary, pa_integral)
750 return type(self)(result)
751 elif (
752 pa.types.is_string(other.type)
753 or pa.types.is_binary(other.type)
754 or pa.types.is_large_string(other.type)
755 ) and op in [operator.mul, roperator.rmul]:
756 binary = other
757 integral = self._pa_array
758 if not pa.types.is_integer(integral.type):
759 raise TypeError("Can only string multiply by an integer.")
760 pa_integral = pc.if_else(pc.less(integral, 0), 0, integral)
761 result = pc.binary_repeat(binary, pa_integral)
762 return type(self)(result)
763 if (
764 isinstance(other, pa.Scalar)
765 and pc.is_null(other).as_py()
766 and op.__name__ in ARROW_LOGICAL_FUNCS
767 ):
768 # pyarrow kleene ops require null to be typed
769 other = other.cast(pa_type)
770
771 pc_func = arrow_funcs[op.__name__]
772 if pc_func is NotImplemented:
773 raise NotImplementedError(f"{op.__name__} not implemented.")
774
775 result = pc_func(self._pa_array, other)
776 return type(self)(result)
777
778 def _logical_method(self, other, op):
779 # For integer types `^`, `|`, `&` are bitwise operators and return
780 # integer types. Otherwise these are boolean ops.
781 if pa.types.is_integer(self._pa_array.type):
782 return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS)
783 else:
784 return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)
785
786 def _arith_method(self, other, op):
787 return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)
788
789 def equals(self, other) -> bool:
790 if not isinstance(other, ArrowExtensionArray):
791 return False
792 # I'm told that pyarrow makes __eq__ behave like pandas' equals;
793 # TODO: is this documented somewhere?
794 return self._pa_array == other._pa_array
795
796 @property
797 def dtype(self) -> ArrowDtype:
798 """
799 An instance of 'ExtensionDtype'.
800 """
801 return self._dtype
802
803 @property
804 def nbytes(self) -> int:
805 """
806 The number of bytes needed to store this object in memory.
807 """
808 return self._pa_array.nbytes
809
810 def __len__(self) -> int:
811 """
812 Length of this array.
813
814 Returns
815 -------
816 length : int
817 """
818 return len(self._pa_array)
819
820 def __contains__(self, key) -> bool:
821 # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
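        # Illustrative: float("nan") is "in" a float[pyarrow] array holding NaN,
        # while other NA-likes (None, NaT) are not; pd.NA itself is handled by
        # the superclass check below.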
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key):
                return pc.any(pc.is_nan(self._pa_array)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))

    @property
    def _hasna(self) -> bool:
        return self._pa_array.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        # GH51630: fast paths
        null_count = self._pa_array.null_count
        if null_count == 0:
            return np.zeros(len(self), dtype=np.bool_)
        elif null_count == len(self):
            return np.ones(len(self), dtype=np.bool_)

        return self._pa_array.is_null().to_numpy()

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)

    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._pa_array, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._pa_array.length() in (0, self._pa_array.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._pa_array
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self) -> Self:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._pa_array)

    def dropna(self) -> Self:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._pa_array))

    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        if not self._hasna:
            # TODO(CoW): Not necessary anymore when CoW is the default
            return self.copy()

        if limit is None and limit_area is None:
            method = missing.clean_fill_method(method)
            try:
                if method == "pad":
                    return type(self)(pc.fill_null_forward(self._pa_array))
                elif method == "backfill":
                    return type(self)(pc.fill_null_backward(self._pa_array))
            except pa.ArrowNotImplementedError:
                # ArrowNotImplementedError: Function 'coalesce' has no kernel
                #   matching input types (duration[ns], duration[ns])
                # TODO: remove try/except wrapper if/when pyarrow implements
                #   a kernel for duration types.
                pass

        # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove
        #  this method entirely.
        return super()._pad_or_backfill(
            method=method, limit=limit, limit_area=limit_area, copy=copy
        )

    @doc(ExtensionArray.fillna)
    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        copy: bool = True,
    ) -> Self:
        value, method = validate_fillna_kwargs(value, method)

        if not self._hasna:
            # TODO(CoW): Not necessary anymore when CoW is the default
            return self.copy()

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit, copy=copy)

        if method is not None:
            return super().fillna(method=method, limit=limit, copy=copy)

        if isinstance(value, (np.ndarray, ExtensionArray)):
            # Similar to check_value_size, but we do not mask here since we may
            # end up passing it to the super() method.
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f"expected {len(self)}"
                )

        try:
            fill_value = self._box_pa(value, pa_type=self._pa_array.type)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err

        try:
            return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            #   matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            #   a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit, copy=copy)

    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._pa_array.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        data = self._pa_array
        pa_type = data.type
        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = data.cast(pa.int64())

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            # GH 54844
            combined = encoded.combine_chunks()
            pa_indices = combined.indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(combined.dictionary)

        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(self, decimals: int = 0, *args, **kwargs) -> Self:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
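
        Examples
        --------
        >>> pd.array([1.234, 2.5, None], dtype="float64[pyarrow]").round(1)
        <ArrowExtensionArray>
        [1.2, 2.5, <NA>]
        Length: 3, dtype: float64[pyarrow]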
1197 """
1198 return type(self)(pc.round(self._pa_array, ndigits=decimals))
1199
1200 @doc(ExtensionArray.searchsorted)
1201 def searchsorted(
1202 self,
1203 value: NumpyValueArrayLike | ExtensionArray,
1204 side: Literal["left", "right"] = "left",
1205 sorter: NumpySorter | None = None,
1206 ) -> npt.NDArray[np.intp] | np.intp:
1207 if self._hasna:
1208 raise ValueError(
1209 "searchsorted requires array to be sorted, which is impossible "
1210 "with NAs present."
1211 )
1212 if isinstance(value, ExtensionArray):
1213 value = value.astype(object)
1214 # Base class searchsorted would cast to object, which is *much* slower.
1215 dtype = None
1216 if isinstance(self.dtype, ArrowDtype):
1217 pa_dtype = self.dtype.pyarrow_dtype
1218 if (
1219 pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype)
1220 ) and pa_dtype.unit == "ns":
1221 # np.array[datetime/timedelta].searchsorted(datetime/timedelta)
1222 # erroneously fails when numpy type resolution is nanoseconds
1223 dtype = object
1224 return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter)
1225
1226 def take(
1227 self,
1228 indices: TakeIndexer,
1229 allow_fill: bool = False,
1230 fill_value: Any = None,
1231 ) -> ArrowExtensionArray:
1232 """
1233 Take elements from an array.
1234
1235 Parameters
1236 ----------
1237 indices : sequence of int or one-dimensional np.ndarray of int
1238 Indices to be taken.
1239 allow_fill : bool, default False
1240 How to handle negative values in `indices`.
1241
1242 * False: negative values in `indices` indicate positional indices
1243 from the right (the default). This is similar to
1244 :func:`numpy.take`.
1245
1246 * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        indices_array = np.asanyarray(indices)

        if len(self._pa_array) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._pa_array):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._pa_array))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._pa_array.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._pa_array.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._pa_array)
            return type(self)(self._pa_array.take(indices_array))

    def _maybe_convert_datelike_array(self):
        """Maybe convert to a datelike array."""
        pa_type = self._pa_array.type
        if pa.types.is_timestamp(pa_type):
            return self._to_datetimearray()
        elif pa.types.is_duration(pa_type):
            return self._to_timedeltaarray()
        return self

    def _to_datetimearray(self) -> DatetimeArray:
        """Convert a pyarrow timestamp typed array to a DatetimeArray."""
        from pandas.core.arrays.datetimes import (
            DatetimeArray,
            tz_to_dtype,
        )

        pa_type = self._pa_array.type
        assert pa.types.is_timestamp(pa_type)
        np_dtype = np.dtype(f"M8[{pa_type.unit}]")
        dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
        np_array = self._pa_array.to_numpy()
        np_array = np_array.astype(np_dtype)
        return DatetimeArray._simple_new(np_array, dtype=dtype)

    def _to_timedeltaarray(self) -> TimedeltaArray:
        """Convert a pyarrow duration typed array to a TimedeltaArray."""
        from pandas.core.arrays.timedeltas import TimedeltaArray

        pa_type = self._pa_array.type
        assert pa.types.is_duration(pa_type)
        np_dtype = np.dtype(f"m8[{pa_type.unit}]")
        np_array = self._pa_array.to_numpy()
        np_array = np_array.astype(np_dtype)
        return TimedeltaArray._simple_new(np_array, dtype=np_dtype)

    def _values_for_json(self) -> np.ndarray:
        if is_numeric_dtype(self.dtype):
            return np.asarray(self, dtype=object)
        return super()._values_for_json()

    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        original_na_value = na_value
        dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
        pa_type = self._pa_array.type
        if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
            data = self
        else:
            data = self.fillna(na_value)
            copy = False

        if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
            # GH 55997
            if dtype != object and na_value is self.dtype.na_value:
                na_value = lib.no_default
            result = data._maybe_convert_datelike_array().to_numpy(
                dtype=dtype, na_value=na_value
            )
        elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type):
            # convert to list of python datetime.time objects before
            # wrapping in ndarray
            result = np.array(list(data), dtype=dtype)
            if data._hasna:
                result[data.isna()] = na_value
        elif pa.types.is_null(pa_type):
            if dtype is not None and isna(na_value):
                na_value = None
            result = np.full(len(data), fill_value=na_value, dtype=dtype)
        elif not data._hasna or (
            pa.types.is_floating(pa_type)
            and (
                na_value is np.nan
                or original_na_value is lib.no_default
                and is_float_dtype(dtype)
            )
        ):
            result = data._pa_array.to_numpy()
            if dtype is not None:
                result = result.astype(dtype, copy=False)
            if copy:
                result = result.copy()
        else:
            if dtype is None:
                empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False)
                if can_hold_element(empty, na_value):
                    dtype = empty.dtype
                else:
                    dtype = np.object_
            result = np.empty(len(data), dtype=dtype)
            mask = data.isna()
            result[mask] = na_value
            result[~mask] = data[~mask]._pa_array.to_numpy()
        return result

    def map(self, mapper, na_action=None):
        if is_numeric_dtype(self.dtype):
            return map_array(self.to_numpy(), mapper, na_action=na_action)
        else:
            return super().map(mapper, na_action)

    @doc(ExtensionArray.duplicated)
    def duplicated(
        self, keep: Literal["first", "last", False] = "first"
    ) -> npt.NDArray[np.bool_]:
        pa_type = self._pa_array.type
        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
            values = self.to_numpy(na_value=0)
        elif pa.types.is_boolean(pa_type):
            values = self.to_numpy(na_value=False)
        elif pa.types.is_temporal(pa_type):
            if pa_type.bit_width == 32:
                pa_type = pa.int32()
            else:
                pa_type = pa.int64()
            arr = self.astype(ArrowDtype(pa_type))
            values = arr.to_numpy(na_value=0)
        else:
            # factorize the values to avoid the performance penalty of
            # converting to object dtype
            values = self.factorize()[0]

        mask = self.isna() if self._hasna else None
        return algos.duplicated(values, keep=keep, mask=mask)

    def unique(self) -> Self:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
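
        Examples
        --------
        >>> pd.array([1, 1, 2, None], dtype="int64[pyarrow]").unique()
        <ArrowExtensionArray>
        [1, 2, <NA>]
        Length: 3, dtype: int64[pyarrow]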
1454 """
1455 pa_type = self._pa_array.type
1456
1457 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1458 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1459 data = self._pa_array.cast(pa.int64())
1460 else:
1461 data = self._pa_array
1462
1463 pa_result = pc.unique(data)
1464
1465 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1466 pa_result = pa_result.cast(pa_type)
1467
1468 return type(self)(pa_result)
1469
1470 def value_counts(self, dropna: bool = True) -> Series:
1471 """
1472 Return a Series containing counts of each unique value.
1473
1474 Parameters
1475 ----------
1476 dropna : bool, default True
1477 Don't include counts of missing values.
1478
1479 Returns
1480 -------
1481 counts : Series
1482
1483 See Also
1484 --------
1485 Series.value_counts
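
        Examples
        --------
        >>> pd.array([1, 1, 2], dtype="int64[pyarrow]").value_counts()
        1    2
        2    1
        Name: count, dtype: int64[pyarrow]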
1486 """
1487 pa_type = self._pa_array.type
1488 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1489 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1490 data = self._pa_array.cast(pa.int64())
1491 else:
1492 data = self._pa_array
1493
1494 from pandas import (
1495 Index,
1496 Series,
1497 )
1498
1499 vc = data.value_counts()
1500
1501 values = vc.field(0)
1502 counts = vc.field(1)
1503 if dropna and data.null_count > 0:
1504 mask = values.is_valid()
1505 values = values.filter(mask)
1506 counts = counts.filter(mask)
1507
1508 if pa_version_under11p0 and pa.types.is_duration(pa_type):
1509 values = values.cast(pa_type)
1510
1511 counts = ArrowExtensionArray(counts)
1512
1513 index = Index(type(self)(values))
1514
1515 return Series(counts, index=index, name="count", copy=False)
1516
1517 @classmethod
1518 def _concat_same_type(cls, to_concat) -> Self:
1519 """
1520 Concatenate multiple ArrowExtensionArrays.
1521
1522 Parameters
1523 ----------
1524 to_concat : sequence of ArrowExtensionArrays
1525
1526 Returns
1527 -------
1528 ArrowExtensionArray
1529 """
1530 chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
1531 if to_concat[0].dtype == "string":
1532 # StringDtype has no attribute pyarrow_dtype
1533 pa_dtype = pa.large_string()
1534 else:
1535 pa_dtype = to_concat[0].dtype.pyarrow_dtype
1536 arr = pa.chunked_array(chunks, type=pa_dtype)
1537 return cls(arr)
1538
1539 def _accumulate(
1540 self, name: str, *, skipna: bool = True, **kwargs
1541 ) -> ArrowExtensionArray | ExtensionArray:
1542 """
1543 Return an ExtensionArray performing an accumulation operation.
1544
1545 The underlying data type might change.
1546
1547 Parameters
1548 ----------
1549 name : str
1550 Name of the function, supported values are:
1551 - cummin
1552 - cummax
1553 - cumsum
1554 - cumprod
1555 skipna : bool, default True
1556 If True, skip NA values.
1557 **kwargs
1558 Additional keyword arguments passed to the accumulation function.
1559 Currently, there is no supported kwarg.
1560
1561 Returns
1562 -------
1563 array
1564
1565 Raises
1566 ------
1567 NotImplementedError : subclass does not define accumulations
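
        Examples
        --------
        >>> pd.Series([1, 2, 3], dtype="int64[pyarrow]").cumsum()
        0    1
        1    3
        2    6
        dtype: int64[pyarrow]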
1568 """
1569 pyarrow_name = {
1570 "cummax": "cumulative_max",
1571 "cummin": "cumulative_min",
1572 "cumprod": "cumulative_prod_checked",
1573 "cumsum": "cumulative_sum_checked",
1574 }.get(name, name)
1575 pyarrow_meth = getattr(pc, pyarrow_name, None)
1576 if pyarrow_meth is None:
1577 return super()._accumulate(name, skipna=skipna, **kwargs)
1578
1579 data_to_accum = self._pa_array
1580
1581 pa_dtype = data_to_accum.type
1582
1583 convert_to_int = (
1584 pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"]
1585 ) or (pa.types.is_duration(pa_dtype) and name == "cumsum")
1586
1587 if convert_to_int:
1588 if pa_dtype.bit_width == 32:
1589 data_to_accum = data_to_accum.cast(pa.int32())
1590 else:
1591 data_to_accum = data_to_accum.cast(pa.int64())
1592
1593 result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1594
1595 if convert_to_int:
1596 result = result.cast(pa_dtype)
1597
1598 return type(self)(result)
1599
1600 def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar:
1601 """
1602 Return a pyarrow scalar result of performing the reduction operation.
1603
1604 Parameters
1605 ----------
1606 name : str
1607 Name of the function, supported values are:
1608 { any, all, min, max, sum, mean, median, prod,
1609 std, var, sem, kurt, skew }.
1610 skipna : bool, default True
1611 If True, skip NaN values.
1612 **kwargs
1613 Additional keyword arguments passed to the reduction function.
1614 Currently, `ddof` is the only supported kwarg.
1615
1616 Returns
1617 -------
1618 pyarrow scalar
1619
1620 Raises
1621 ------
1622 TypeError : subclass does not define reductions
1623 """
1624 pa_type = self._pa_array.type
1625
1626 data_to_reduce = self._pa_array
1627
1628 cast_kwargs = {} if pa_version_under13p0 else {"safe": False}
1629
1630 if name in ["any", "all"] and (
1631 pa.types.is_integer(pa_type)
1632 or pa.types.is_floating(pa_type)
1633 or pa.types.is_duration(pa_type)
1634 or pa.types.is_decimal(pa_type)
1635 ):
1636 # pyarrow only supports any/all for boolean dtype, we allow
1637 # for other dtypes, matching our non-pyarrow behavior
1638
1639 if pa.types.is_duration(pa_type):
1640 data_to_cmp = self._pa_array.cast(pa.int64())
1641 else:
1642 data_to_cmp = self._pa_array
1643
1644 not_eq = pc.not_equal(data_to_cmp, 0)
1645 data_to_reduce = not_eq
1646
1647 elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
1648 data_to_reduce = self._pa_array.cast(pa.int64())
1649
1650 elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type):
1651 nbits = pa_type.bit_width
1652 if nbits == 32:
1653 data_to_reduce = self._pa_array.cast(pa.int32())
1654 else:
1655 data_to_reduce = self._pa_array.cast(pa.int64())
1656
1657 if name == "sem":
1658
1659 def pyarrow_meth(data, skip_nulls, **kwargs):
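                # sem = stddev / sqrt(number of non-null observations)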
                numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs)
                denominator = pc.sqrt_checked(pc.count(self._pa_array))
                return pc.divide_checked(numerator, denominator)

        else:
            pyarrow_name = {
                "median": "quantile",
                "prod": "product",
                "std": "stddev",
                "var": "variance",
            }.get(name, name)
            # error: Incompatible types in assignment
            # (expression has type "Optional[Any]", variable has type
            # "Callable[[Any, Any, KwArg(Any)], Any]")
            pyarrow_meth = getattr(pc, pyarrow_name, None)  # type: ignore[assignment]
            if pyarrow_meth is None:
                # Let ExtensionArray._reduce raise the TypeError
                return super()._reduce(name, skipna=skipna, **kwargs)

        # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
        if name in ["any", "all"] and "min_count" not in kwargs:
            kwargs["min_count"] = 0
        elif name == "median":
            # GH 52679: Use quantile instead of approximate_median
            kwargs["q"] = 0.5

        try:
            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
        except (AttributeError, NotImplementedError, TypeError) as err:
            msg = (
                f"'{type(self).__name__}' with dtype {self.dtype} "
                f"does not support reduction '{name}' with pyarrow "
                f"version {pa.__version__}. '{name}' may be supported by "
                f"upgrading pyarrow."
            )
            raise TypeError(msg) from err
        if name == "median":
            # GH 52679: Use quantile instead of approximate_median; returns array
            result = result[0]
        if pc.is_null(result).as_py():
            return result

        if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type):
            result = result.cast(pa_type)
        if name in ["median", "mean"] and pa.types.is_temporal(pa_type):
            if not pa_version_under13p0:
                nbits = pa_type.bit_width
                if nbits == 32:
                    result = result.cast(pa.int32(), **cast_kwargs)
                else:
                    result = result.cast(pa.int64(), **cast_kwargs)
            result = result.cast(pa_type)
        if name in ["std", "sem"] and pa.types.is_temporal(pa_type):
            result = result.cast(pa.int64(), **cast_kwargs)
            if pa.types.is_duration(pa_type):
                result = result.cast(pa_type)
            elif pa.types.is_time(pa_type):
                unit = get_unit_from_pa_dtype(pa_type)
                result = result.cast(pa.duration(unit))
            elif pa.types.is_date(pa_type):
                # go with closest available unit, i.e. "s"
                result = result.cast(pa.duration("s"))
            else:
                # i.e. timestamp
                result = result.cast(pa.duration(pa_type.unit))

        return result

    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        """
        Return a scalar result of performing the reduction operation.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            { any, all, min, max, sum, mean, median, prod,
            std, var, sem, kurt, skew }.
        skipna : bool, default True
            If True, skip NaN values.
        **kwargs
            Additional keyword arguments passed to the reduction function.
            Currently, `ddof` is the only supported kwarg.

        Returns
        -------
        scalar

        Raises
        ------
        TypeError : subclass does not define reductions
        """
        result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs)
        if isinstance(result, pa.Array):
            return type(self)(result)
        else:
            return result

    def _reduce_calc(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs)

        if keepdims:
            if isinstance(pa_result, pa.Scalar):
                result = pa.array([pa_result.as_py()], type=pa_result.type)
            else:
                result = pa.array(
                    [pa_result],
                    type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]),
                )
            return result

        if pc.is_null(pa_result).as_py():
            return self.dtype.na_value
        elif isinstance(pa_result, pa.Scalar):
            return pa_result.as_py()
        else:
            return pa_result

    def _explode(self):
        """
        See Series.explode.__doc__.
        """
1786 # child class explode method supports only list types; return
1787 # default implementation for non list types.
1788 if not pa.types.is_list(self.dtype.pyarrow_dtype):
1789 return super()._explode()
1790 values = self
1791 counts = pa.compute.list_value_length(values._pa_array)
1792 counts = counts.fill_null(1).to_numpy()
1793 fill_value = pa.scalar([None], type=self._pa_array.type)
1794 mask = counts == 0
1795 if mask.any():
1796 values = values.copy()
1797 values[mask] = fill_value
1798 counts = counts.copy()
1799 counts[mask] = 1
1800 values = values.fillna(fill_value)
1801 values = type(self)(pa.compute.list_flatten(values._pa_array))
1802 return values, counts
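
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    # null rows and empty lists each explode to a single missing value:
    #
    #   arr = pd.array([[1, 2], None, []], dtype=ArrowDtype(pa.list_(pa.int64())))
    #   values, counts = arr._explode()
    #   # values -> [1, 2, <NA>, <NA>]; counts -> np.array([2, 1, 1])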
1803
1804 def __setitem__(self, key, value) -> None:
1805 """Set one or more values inplace.
1806
1807 Parameters
1808 ----------
1809 key : int, ndarray, or slice
            When called from, e.g., ``Series.__setitem__``, ``key`` will be
            one of

            * scalar int
            * ndarray of integers
            * boolean ndarray
            * slice object

        value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object
            value or values to be set at ``key``.
1820
1821 Returns
1822 -------
1823 None
1824 """
1825 # GH50085: unwrap 1D indexers
1826 if isinstance(key, tuple) and len(key) == 1:
1827 key = key[0]
1828
1829 key = check_array_indexer(self, key)
1830 value = self._maybe_convert_setitem_value(value)
1831
1832 if com.is_null_slice(key):
1833 # fast path (GH50248)
1834 data = self._if_else(True, value, self._pa_array)
1835
1836 elif is_integer(key):
1837 # fast path
1838 key = cast(int, key)
1839 n = len(self)
1840 if key < 0:
1841 key += n
1842 if not 0 <= key < n:
1843 raise IndexError(
1844 f"index {key} is out of bounds for axis 0 with size {n}"
1845 )
1846 if isinstance(value, pa.Scalar):
1847 value = value.as_py()
1848 elif is_list_like(value):
1849 raise ValueError("Length of indexer and values mismatch")
1850 chunks = [
1851 *self._pa_array[:key].chunks,
1852 pa.array([value], type=self._pa_array.type, from_pandas=True),
1853 *self._pa_array[key + 1 :].chunks,
1854 ]
1855 data = pa.chunked_array(chunks).combine_chunks()
1856
1857 elif is_bool_dtype(key):
1858 key = np.asarray(key, dtype=np.bool_)
1859 data = self._replace_with_mask(self._pa_array, key, value)
1860
1861 elif is_scalar(value) or isinstance(value, pa.Scalar):
1862 mask = np.zeros(len(self), dtype=np.bool_)
1863 mask[key] = True
1864 data = self._if_else(mask, value, self._pa_array)
1865
1866 else:
1867 indices = np.arange(len(self))[key]
1868 if len(indices) != len(value):
1869 raise ValueError("Length of indexer and values mismatch")
1870 if len(indices) == 0:
1871 return
1872 argsort = np.argsort(indices)
1873 indices = indices[argsort]
1874 value = value.take(argsort)
1875 mask = np.zeros(len(self), dtype=np.bool_)
1876 mask[indices] = True
1877 data = self._replace_with_mask(self._pa_array, mask, value)
1878
1879 if isinstance(data, pa.Array):
1880 data = pa.chunked_array([data])
1881 self._pa_array = data
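
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([1, 2, 3], dtype="int64[pyarrow]")
    #   arr[0] = 10                                # integer fast path
    #   arr[np.array([False, True, True])] = 0     # boolean-mask path
    #   arr[[1, 2]] = [20, 30]                     # integer-indexer path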
1882
1883 def _rank_calc(
1884 self,
1885 *,
1886 axis: AxisInt = 0,
1887 method: str = "average",
1888 na_option: str = "keep",
1889 ascending: bool = True,
1890 pct: bool = False,
1891 ):
1892 if axis != 0:
1893 ranked = super()._rank(
1894 axis=axis,
1895 method=method,
1896 na_option=na_option,
1897 ascending=ascending,
1898 pct=pct,
1899 )
1900 # keep dtypes consistent with the implementation below
1901 if method == "average" or pct:
1902 pa_type = pa.float64()
1903 else:
1904 pa_type = pa.uint64()
1905 result = pa.array(ranked, type=pa_type, from_pandas=True)
1906 return result
1907
1908 data = self._pa_array.combine_chunks()
1909 sort_keys = "ascending" if ascending else "descending"
1910 null_placement = "at_start" if na_option == "top" else "at_end"
1911 tiebreaker = "min" if method == "average" else method
1912
1913 result = pc.rank(
1914 data,
1915 sort_keys=sort_keys,
1916 null_placement=null_placement,
1917 tiebreaker=tiebreaker,
1918 )
1919
1920 if na_option == "keep":
1921 mask = pc.is_null(self._pa_array)
1922 null = pa.scalar(None, type=result.type)
1923 result = pc.if_else(mask, null, result)
1924
1925 if method == "average":
1926 result_max = pc.rank(
1927 data,
1928 sort_keys=sort_keys,
1929 null_placement=null_placement,
1930 tiebreaker="max",
1931 )
1932 result_max = result_max.cast(pa.float64())
1933 result_min = result.cast(pa.float64())
1934 result = pc.divide(pc.add(result_min, result_max), 2)
1935
1936 if pct:
1937 if not pa.types.is_floating(result.type):
1938 result = result.cast(pa.float64())
1939 if method == "dense":
1940 divisor = pc.max(result)
1941 else:
1942 divisor = pc.count(result)
1943 result = pc.divide(result, divisor)
1944
1945 return result
1946
1947 def _rank(
1948 self,
1949 *,
1950 axis: AxisInt = 0,
1951 method: str = "average",
1952 na_option: str = "keep",
1953 ascending: bool = True,
1954 pct: bool = False,
1955 ):
1956 """
1957 See Series.rank.__doc__.
1958 """
1959 return type(self)(
1960 self._rank_calc(
1961 axis=axis,
1962 method=method,
1963 na_option=na_option,
1964 ascending=ascending,
1965 pct=pct,
1966 )
1967 )
1968
1969 def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self:
1970 """
1971 Compute the quantiles of self for each quantile in `qs`.
1972
1973 Parameters
1974 ----------
        qs : np.ndarray[float64]
            The quantiles to compute.
        interpolation : str
            Interpolation method, passed through to ``pc.quantile``.
1977
1978 Returns
1979 -------
1980 same type as self
1981 """
1982 pa_dtype = self._pa_array.type
1983
1984 data = self._pa_array
1985 if pa.types.is_temporal(pa_dtype):
1986 # https://github.com/apache/arrow/issues/33769 in these cases
1987 # we can cast to ints and back
1988 nbits = pa_dtype.bit_width
1989 if nbits == 32:
1990 data = data.cast(pa.int32())
1991 else:
1992 data = data.cast(pa.int64())
1993
1994 result = pc.quantile(data, q=qs, interpolation=interpolation)
1995
1996 if pa.types.is_temporal(pa_dtype):
1997 if pa.types.is_floating(result.type):
1998 result = pc.floor(result)
1999 nbits = pa_dtype.bit_width
2000 if nbits == 32:
2001 result = result.cast(pa.int32())
2002 else:
2003 result = result.cast(pa.int64())
2004 result = result.cast(pa_dtype)
2005
2006 return type(self)(result)
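
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([1, 2, 3, 4], dtype="int64[pyarrow]")
    #   arr._quantile(np.array([0.25, 0.5]), "linear")   # -> [1.75, 2.5]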
2007
2008 def _mode(self, dropna: bool = True) -> Self:
2009 """
2010 Returns the mode(s) of the ExtensionArray.
2011
2012 Always returns `ExtensionArray` even if only one value.
2013
2014 Parameters
2015 ----------
2016 dropna : bool, default True
2017 Don't consider counts of NA values.
2018
2019 Returns
2020 -------
2021 same type as self
2022 Sorted, if possible.
2023 """
2024 pa_type = self._pa_array.type
2025 if pa.types.is_temporal(pa_type):
2026 nbits = pa_type.bit_width
2027 if nbits == 32:
2028 data = self._pa_array.cast(pa.int32())
2029 elif nbits == 64:
2030 data = self._pa_array.cast(pa.int64())
2031 else:
2032 raise NotImplementedError(pa_type)
2033 else:
2034 data = self._pa_array
2035
2036 if dropna:
2037 data = data.drop_null()
2038
2039 res = pc.value_counts(data)
2040 most_common = res.field("values").filter(
2041 pc.equal(res.field("counts"), pc.max(res.field("counts")))
2042 )
2043
2044 if pa.types.is_temporal(pa_type):
2045 most_common = most_common.cast(pa_type)
2046
2047 most_common = most_common.take(pc.array_sort_indices(most_common))
2048 return type(self)(most_common)
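
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([1, 2, 2, 3, 3, None], dtype="int64[pyarrow]")
    #   arr._mode()   # -> [2, 3] (both occur twice; result is sorted)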
2049
2050 def _maybe_convert_setitem_value(self, value):
2051 """Maybe convert value to be pyarrow compatible."""
2052 try:
2053 value = self._box_pa(value, self._pa_array.type)
2054 except pa.ArrowTypeError as err:
2055 msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
2056 raise TypeError(msg) from err
2057 return value
2058
2059 def interpolate(
2060 self,
2061 *,
2062 method: InterpolateOptions,
2063 axis: int,
2064 index,
2065 limit,
2066 limit_direction,
2067 limit_area,
2068 copy: bool,
2069 **kwargs,
2070 ) -> Self:
2071 """
2072 See NDFrame.interpolate.__doc__.
2073 """
2074 # NB: we return type(self) even if copy=False
2075 mask = self.isna()
2076 if self.dtype.kind == "f":
2077 data = self._pa_array.to_numpy()
2078 elif self.dtype.kind in "iu":
2079 data = self.to_numpy(dtype="f8", na_value=0.0)
2080 else:
2081 raise NotImplementedError(
2082 f"interpolate is not implemented for dtype={self.dtype}"
2083 )
2084
2085 missing.interpolate_2d_inplace(
2086 data,
2087 method=method,
2088 axis=0,
2089 index=index,
2090 limit=limit,
2091 limit_direction=limit_direction,
2092 limit_area=limit_area,
2093 mask=mask,
2094 **kwargs,
2095 )
2096 return type(self)(self._box_pa_array(pa.array(data, mask=mask)))
2097
2098 @classmethod
2099 def _if_else(
2100 cls,
2101 cond: npt.NDArray[np.bool_] | bool,
2102 left: ArrayLike | Scalar,
2103 right: ArrayLike | Scalar,
2104 ):
2105 """
2106 Choose values based on a condition.
2107
        Analogous to pyarrow.compute.if_else, with logic
        to fall back to numpy for unsupported types.

        Parameters
        ----------
        cond : npt.NDArray[np.bool_] or bool
        left : ArrayLike or Scalar
        right : ArrayLike or Scalar
2116
2117 Returns
2118 -------
2119 pa.Array
2120 """
2121 try:
2122 return pc.if_else(cond, left, right)
2123 except pa.ArrowNotImplementedError:
2124 pass
2125
2126 def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
2127 if isinstance(value, (pa.Array, pa.ChunkedArray)):
2128 pa_type = value.type
2129 elif isinstance(value, pa.Scalar):
2130 pa_type = value.type
2131 value = value.as_py()
2132 else:
2133 pa_type = None
2134 return np.array(value, dtype=object), pa_type
2135
2136 left, left_type = _to_numpy_and_type(left)
2137 right, right_type = _to_numpy_and_type(right)
2138 pa_type = left_type or right_type
2139 result = np.where(cond, left, right)
2140 return pa.array(result, type=pa_type, from_pandas=True)
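
    # For example (illustrative sketch): take ``left`` where ``cond`` is True
    # and ``right`` elsewhere, falling back to numpy only when pyarrow's
    # ``if_else`` kernel does not support the type:
    #
    #   cond = np.array([True, False, True])
    #   ArrowExtensionArray._if_else(cond, pa.scalar(0), pa.array([1, 2, 3]))
    #   # -> <pyarrow array> [0, 2, 0]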
2141
2142 @classmethod
2143 def _replace_with_mask(
2144 cls,
2145 values: pa.Array | pa.ChunkedArray,
2146 mask: npt.NDArray[np.bool_] | bool,
2147 replacements: ArrayLike | Scalar,
2148 ):
2149 """
2150 Replace items selected with a mask.
2151
2152 Analogous to pyarrow.compute.replace_with_mask, with logic
        to fall back to numpy for unsupported types.
2154
2155 Parameters
2156 ----------
2157 values : pa.Array or pa.ChunkedArray
2158 mask : npt.NDArray[np.bool_] or bool
2159 replacements : ArrayLike or Scalar
2160 Replacement value(s)
2161
2162 Returns
2163 -------
2164 pa.Array or pa.ChunkedArray
2165 """
2166 if isinstance(replacements, pa.ChunkedArray):
2167 # replacements must be array or scalar, not ChunkedArray
2168 replacements = replacements.combine_chunks()
2169 if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type):
2170 # GH#52059 replace_with_mask segfaults for chunked array
2171 # https://github.com/apache/arrow/issues/34634
2172 values = values.combine_chunks()
2173 try:
2174 return pc.replace_with_mask(values, mask, replacements)
2175 except pa.ArrowNotImplementedError:
2176 pass
2177 if isinstance(replacements, pa.Array):
2178 replacements = np.array(replacements, dtype=object)
2179 elif isinstance(replacements, pa.Scalar):
2180 replacements = replacements.as_py()
2181 result = np.array(values, dtype=object)
2182 result[mask] = replacements
2183 return pa.array(result, type=values.type, from_pandas=True)
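
    # For example (illustrative sketch): one replacement value per True mask
    # entry:
    #
    #   values = pa.array([1, 2, 3])
    #   mask = np.array([False, True, False])
    #   ArrowExtensionArray._replace_with_mask(values, mask, pa.array([99]))
    #   # -> <pyarrow array> [1, 99, 3]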
2184
2185 # ------------------------------------------------------------------
2186 # GroupBy Methods
2187
2188 def _to_masked(self):
2189 pa_dtype = self._pa_array.type
2190
2191 if pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype):
2192 na_value = 1
2193 elif pa.types.is_boolean(pa_dtype):
2194 na_value = True
2195 else:
2196 raise NotImplementedError
2197
2198 dtype = _arrow_dtype_mapping()[pa_dtype]
2199 mask = self.isna()
2200 arr = self.to_numpy(dtype=dtype.numpy_dtype, na_value=na_value)
2201 return dtype.construct_array_type()(arr, mask)
2202
2203 def _groupby_op(
2204 self,
2205 *,
2206 how: str,
2207 has_dropped_na: bool,
2208 min_count: int,
2209 ngroups: int,
2210 ids: npt.NDArray[np.intp],
2211 **kwargs,
2212 ):
2213 if isinstance(self.dtype, StringDtype):
2214 return super()._groupby_op(
2215 how=how,
2216 has_dropped_na=has_dropped_na,
2217 min_count=min_count,
2218 ngroups=ngroups,
2219 ids=ids,
2220 **kwargs,
2221 )
2222
2223 # maybe convert to a compatible dtype optimized for groupby
2224 values: ExtensionArray
2225 pa_type = self._pa_array.type
2226 if pa.types.is_timestamp(pa_type):
2227 values = self._to_datetimearray()
2228 elif pa.types.is_duration(pa_type):
2229 values = self._to_timedeltaarray()
2230 else:
2231 values = self._to_masked()
2232
2233 result = values._groupby_op(
2234 how=how,
2235 has_dropped_na=has_dropped_na,
2236 min_count=min_count,
2237 ngroups=ngroups,
2238 ids=ids,
2239 **kwargs,
2240 )
2241 if isinstance(result, np.ndarray):
2242 return result
2243 return type(self)._from_sequence(result, copy=False)
2244
2245 def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
2246 """Apply a callable to each element while maintaining the chunking structure."""
2247 return [
2248 [
2249 None if val is None else func(val)
2250 for val in chunk.to_numpy(zero_copy_only=False)
2251 ]
2252 for chunk in self._pa_array.iterchunks()
2253 ]
2254
2255 def _str_count(self, pat: str, flags: int = 0):
2256 if flags:
2257 raise NotImplementedError(f"count not implemented with {flags=}")
2258 return type(self)(pc.count_substring_regex(self._pa_array, pat))
2259
2260 def _str_contains(
2261 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
2262 ):
2263 if flags:
2264 raise NotImplementedError(f"contains not implemented with {flags=}")
2265
2266 if regex:
2267 pa_contains = pc.match_substring_regex
2268 else:
2269 pa_contains = pc.match_substring
2270 result = pa_contains(self._pa_array, pat, ignore_case=not case)
2271 if not isna(na):
2272 result = result.fill_null(na)
2273 return type(self)(result)
2274
2275 def _str_startswith(self, pat: str | tuple[str, ...], na=None):
2276 if isinstance(pat, str):
2277 result = pc.starts_with(self._pa_array, pattern=pat)
2278 else:
2279 if len(pat) == 0:
2280 # For empty tuple, pd.StringDtype() returns null for missing values
2281 # and false for valid values.
2282 result = pc.if_else(pc.is_null(self._pa_array), None, False)
2283 else:
2284 result = pc.starts_with(self._pa_array, pattern=pat[0])
2285
2286 for p in pat[1:]:
2287 result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
2288 if not isna(na):
2289 result = result.fill_null(na)
2290 return type(self)(result)
2291
2292 def _str_endswith(self, pat: str | tuple[str, ...], na=None):
2293 if isinstance(pat, str):
2294 result = pc.ends_with(self._pa_array, pattern=pat)
2295 else:
2296 if len(pat) == 0:
2297 # For empty tuple, pd.StringDtype() returns null for missing values
2298 # and false for valid values.
2299 result = pc.if_else(pc.is_null(self._pa_array), None, False)
2300 else:
2301 result = pc.ends_with(self._pa_array, pattern=pat[0])
2302
2303 for p in pat[1:]:
2304 result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
2305 if not isna(na):
2306 result = result.fill_null(na)
2307 return type(self)(result)
2308
2309 def _str_replace(
2310 self,
2311 pat: str | re.Pattern,
2312 repl: str | Callable,
2313 n: int = -1,
2314 case: bool = True,
2315 flags: int = 0,
2316 regex: bool = True,
2317 ):
2318 if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
2319 raise NotImplementedError(
2320 "replace is not supported with a re.Pattern, callable repl, "
2321 "case=False, or flags!=0"
2322 )
2323
2324 func = pc.replace_substring_regex if regex else pc.replace_substring
2325 # https://github.com/apache/arrow/issues/39149
2326 # GH 56404, unexpected behavior with negative max_replacements with pyarrow.
2327 pa_max_replacements = None if n < 0 else n
2328 result = func(
2329 self._pa_array,
2330 pattern=pat,
2331 replacement=repl,
2332 max_replacements=pa_max_replacements,
2333 )
2334 return type(self)(result)
2335
    def _str_repeat(self, repeats: int | Sequence[int]):
        if not isinstance(repeats, int):
            raise NotImplementedError(
                f"repeat is not implemented when repeats is {type(repeats).__name__}"
            )
        return type(self)(pc.binary_repeat(self._pa_array, repeats))
2343
2344 def _str_match(
2345 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
2346 ):
2347 if not pat.startswith("^"):
2348 pat = f"^{pat}"
2349 return self._str_contains(pat, case, flags, na, regex=True)
2350
2351 def _str_fullmatch(
2352 self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
2353 ):
2354 if not pat.endswith("$") or pat.endswith("\\$"):
2355 pat = f"{pat}$"
2356 return self._str_match(pat, case, flags, na)
2357
2358 def _str_find(self, sub: str, start: int = 0, end: int | None = None):
2359 if start != 0 and end is not None:
2360 slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
2361 result = pc.find_substring(slices, sub)
2362 not_found = pc.equal(result, -1)
2363 start_offset = max(0, start)
2364 offset_result = pc.add(result, start_offset)
2365 result = pc.if_else(not_found, result, offset_result)
2366 elif start == 0 and end is None:
2367 slices = self._pa_array
2368 result = pc.find_substring(slices, sub)
2369 else:
2370 raise NotImplementedError(
2371 f"find not implemented with {sub=}, {start=}, {end=}"
2372 )
2373 return type(self)(result)
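
    # For example (illustrative sketch): with ``start``/``end`` the search runs
    # on a slice, so found positions are shifted back to full-string offsets:
    #
    #   arr = ArrowExtensionArray(pa.chunked_array([["abcab"]]))
    #   arr._str_find("ab", start=1, end=5)   # -> [3] (-1 when not found)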
2374
2375 def _str_join(self, sep: str):
2376 if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
2377 self._pa_array.type
2378 ):
2379 result = self._apply_elementwise(list)
2380 result = pa.chunked_array(result, type=pa.list_(pa.string()))
2381 else:
2382 result = self._pa_array
2383 return type(self)(pc.binary_join(result, sep))
2384
2385 def _str_partition(self, sep: str, expand: bool):
2386 predicate = lambda val: val.partition(sep)
2387 result = self._apply_elementwise(predicate)
2388 return type(self)(pa.chunked_array(result))
2389
2390 def _str_rpartition(self, sep: str, expand: bool):
2391 predicate = lambda val: val.rpartition(sep)
2392 result = self._apply_elementwise(predicate)
2393 return type(self)(pa.chunked_array(result))
2394
2395 def _str_slice(
2396 self, start: int | None = None, stop: int | None = None, step: int | None = None
2397 ):
2398 if start is None:
2399 start = 0
2400 if step is None:
2401 step = 1
2402 return type(self)(
2403 pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step)
2404 )
2405
2406 def _str_isalnum(self):
2407 return type(self)(pc.utf8_is_alnum(self._pa_array))
2408
2409 def _str_isalpha(self):
2410 return type(self)(pc.utf8_is_alpha(self._pa_array))
2411
2412 def _str_isdecimal(self):
2413 return type(self)(pc.utf8_is_decimal(self._pa_array))
2414
2415 def _str_isdigit(self):
2416 return type(self)(pc.utf8_is_digit(self._pa_array))
2417
2418 def _str_islower(self):
2419 return type(self)(pc.utf8_is_lower(self._pa_array))
2420
2421 def _str_isnumeric(self):
2422 return type(self)(pc.utf8_is_numeric(self._pa_array))
2423
2424 def _str_isspace(self):
2425 return type(self)(pc.utf8_is_space(self._pa_array))
2426
2427 def _str_istitle(self):
2428 return type(self)(pc.utf8_is_title(self._pa_array))
2429
2430 def _str_isupper(self):
2431 return type(self)(pc.utf8_is_upper(self._pa_array))
2432
2433 def _str_len(self):
2434 return type(self)(pc.utf8_length(self._pa_array))
2435
2436 def _str_lower(self):
2437 return type(self)(pc.utf8_lower(self._pa_array))
2438
2439 def _str_upper(self):
2440 return type(self)(pc.utf8_upper(self._pa_array))
2441
2442 def _str_strip(self, to_strip=None):
2443 if to_strip is None:
2444 result = pc.utf8_trim_whitespace(self._pa_array)
2445 else:
2446 result = pc.utf8_trim(self._pa_array, characters=to_strip)
2447 return type(self)(result)
2448
2449 def _str_lstrip(self, to_strip=None):
2450 if to_strip is None:
2451 result = pc.utf8_ltrim_whitespace(self._pa_array)
2452 else:
2453 result = pc.utf8_ltrim(self._pa_array, characters=to_strip)
2454 return type(self)(result)
2455
2456 def _str_rstrip(self, to_strip=None):
2457 if to_strip is None:
2458 result = pc.utf8_rtrim_whitespace(self._pa_array)
2459 else:
2460 result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
2461 return type(self)(result)
2462
2463 def _str_removeprefix(self, prefix: str):
2464 if not pa_version_under13p0:
2465 starts_with = pc.starts_with(self._pa_array, pattern=prefix)
2466 removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
2467 result = pc.if_else(starts_with, removed, self._pa_array)
2468 return type(self)(result)
2469 predicate = lambda val: val.removeprefix(prefix)
2470 result = self._apply_elementwise(predicate)
2471 return type(self)(pa.chunked_array(result))
2472
2473 def _str_casefold(self):
2474 predicate = lambda val: val.casefold()
2475 result = self._apply_elementwise(predicate)
2476 return type(self)(pa.chunked_array(result))
2477
2478 def _str_encode(self, encoding: str, errors: str = "strict"):
2479 predicate = lambda val: val.encode(encoding, errors)
2480 result = self._apply_elementwise(predicate)
2481 return type(self)(pa.chunked_array(result))
2482
2483 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
2484 if flags:
2485 raise NotImplementedError("Only flags=0 is implemented.")
2486 groups = re.compile(pat).groupindex.keys()
2487 if len(groups) == 0:
2488 raise ValueError(f"{pat=} must contain a symbolic group name.")
2489 result = pc.extract_regex(self._pa_array, pat)
2490 if expand:
2491 return {
2492 col: type(self)(pc.struct_field(result, [i]))
2493 for col, i in zip(groups, range(result.type.num_fields))
2494 }
2495 else:
2496 return type(self)(pc.struct_field(result, [0]))
2497
2498 def _str_findall(self, pat: str, flags: int = 0):
2499 regex = re.compile(pat, flags=flags)
2500 predicate = lambda val: regex.findall(val)
2501 result = self._apply_elementwise(predicate)
2502 return type(self)(pa.chunked_array(result))
2503
2504 def _str_get_dummies(self, sep: str = "|"):
2505 split = pc.split_pattern(self._pa_array, sep)
2506 flattened_values = pc.list_flatten(split)
2507 uniques = flattened_values.unique()
2508 uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques))
2509 lengths = pc.list_value_length(split).fill_null(0).to_numpy()
2510 n_rows = len(self)
2511 n_cols = len(uniques)
2512 indices = pc.index_in(flattened_values, uniques_sorted).to_numpy()
2513 indices = indices + np.arange(n_rows).repeat(lengths) * n_cols
2514 dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
2515 dummies[indices] = True
2516 dummies = dummies.reshape((n_rows, n_cols))
2517 result = type(self)(pa.array(list(dummies)))
2518 return result, uniques_sorted.to_pylist()
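
    # For example (illustrative sketch): null rows produce all-False dummies:
    #
    #   arr = ArrowExtensionArray(pa.chunked_array([["a|b", "b", None]]))
    #   dummies, labels = arr._str_get_dummies()
    #   # labels -> ['a', 'b']
    #   # dummies -> [[True, True], [False, True], [False, False]]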
2519
2520 def _str_index(self, sub: str, start: int = 0, end: int | None = None):
2521 predicate = lambda val: val.index(sub, start, end)
2522 result = self._apply_elementwise(predicate)
2523 return type(self)(pa.chunked_array(result))
2524
2525 def _str_rindex(self, sub: str, start: int = 0, end: int | None = None):
2526 predicate = lambda val: val.rindex(sub, start, end)
2527 result = self._apply_elementwise(predicate)
2528 return type(self)(pa.chunked_array(result))
2529
2530 def _str_normalize(self, form: str):
2531 predicate = lambda val: unicodedata.normalize(form, val)
2532 result = self._apply_elementwise(predicate)
2533 return type(self)(pa.chunked_array(result))
2534
2535 def _str_rfind(self, sub: str, start: int = 0, end=None):
2536 predicate = lambda val: val.rfind(sub, start, end)
2537 result = self._apply_elementwise(predicate)
2538 return type(self)(pa.chunked_array(result))
2539
2540 def _str_split(
2541 self,
2542 pat: str | None = None,
2543 n: int | None = -1,
2544 expand: bool = False,
2545 regex: bool | None = None,
2546 ):
2547 if n in {-1, 0}:
2548 n = None
2549 if pat is None:
2550 split_func = pc.utf8_split_whitespace
2551 elif regex:
2552 split_func = functools.partial(pc.split_pattern_regex, pattern=pat)
2553 else:
2554 split_func = functools.partial(pc.split_pattern, pattern=pat)
2555 return type(self)(split_func(self._pa_array, max_splits=n))
2556
2557 def _str_rsplit(self, pat: str | None = None, n: int | None = -1):
2558 if n in {-1, 0}:
2559 n = None
2560 if pat is None:
2561 return type(self)(
2562 pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True)
2563 )
2564 else:
2565 return type(self)(
2566 pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True)
2567 )
2568
2569 def _str_translate(self, table: dict[int, str]):
2570 predicate = lambda val: val.translate(table)
2571 result = self._apply_elementwise(predicate)
2572 return type(self)(pa.chunked_array(result))
2573
2574 def _str_wrap(self, width: int, **kwargs):
2575 kwargs["width"] = width
2576 tw = textwrap.TextWrapper(**kwargs)
2577 predicate = lambda val: "\n".join(tw.wrap(val))
2578 result = self._apply_elementwise(predicate)
2579 return type(self)(pa.chunked_array(result))
2580
2581 @property
2582 def _dt_days(self):
2583 return type(self)(
2584 pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32())
2585 )
2586
2587 @property
2588 def _dt_hours(self):
2589 return type(self)(
2590 pa.array(
2591 [
2592 td.components.hours if td is not NaT else None
2593 for td in self._to_timedeltaarray()
2594 ],
2595 type=pa.int32(),
2596 )
2597 )
2598
2599 @property
2600 def _dt_minutes(self):
2601 return type(self)(
2602 pa.array(
2603 [
2604 td.components.minutes if td is not NaT else None
2605 for td in self._to_timedeltaarray()
2606 ],
2607 type=pa.int32(),
2608 )
2609 )
2610
2611 @property
2612 def _dt_seconds(self):
2613 return type(self)(
2614 pa.array(
2615 self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32()
2616 )
2617 )
2618
2619 @property
2620 def _dt_milliseconds(self):
2621 return type(self)(
2622 pa.array(
2623 [
2624 td.components.milliseconds if td is not NaT else None
2625 for td in self._to_timedeltaarray()
2626 ],
2627 type=pa.int32(),
2628 )
2629 )
2630
2631 @property
2632 def _dt_microseconds(self):
2633 return type(self)(
2634 pa.array(
2635 self._to_timedeltaarray().microseconds,
2636 from_pandas=True,
2637 type=pa.int32(),
2638 )
2639 )
2640
2641 @property
2642 def _dt_nanoseconds(self):
2643 return type(self)(
2644 pa.array(
2645 self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32()
2646 )
2647 )
2648
2649 def _dt_to_pytimedelta(self):
2650 data = self._pa_array.to_pylist()
2651 if self._dtype.pyarrow_dtype.unit == "ns":
2652 data = [None if ts is None else ts.to_pytimedelta() for ts in data]
2653 return np.array(data, dtype=object)
2654
2655 def _dt_total_seconds(self):
2656 return type(self)(
2657 pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True)
2658 )
2659
2660 def _dt_as_unit(self, unit: str):
2661 if pa.types.is_date(self.dtype.pyarrow_dtype):
2662 raise NotImplementedError("as_unit not implemented for date types")
2663 pd_array = self._maybe_convert_datelike_array()
        # Don't cast _pa_array directly; convert via the pandas array so that
        # pandas' unit conversion rules are followed
2665 return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True))
2666
2667 @property
2668 def _dt_year(self):
2669 return type(self)(pc.year(self._pa_array))
2670
2671 @property
2672 def _dt_day(self):
2673 return type(self)(pc.day(self._pa_array))
2674
2675 @property
2676 def _dt_day_of_week(self):
2677 return type(self)(pc.day_of_week(self._pa_array))
2678
2679 _dt_dayofweek = _dt_day_of_week
2680 _dt_weekday = _dt_day_of_week
2681
2682 @property
2683 def _dt_day_of_year(self):
2684 return type(self)(pc.day_of_year(self._pa_array))
2685
2686 _dt_dayofyear = _dt_day_of_year
2687
2688 @property
2689 def _dt_hour(self):
2690 return type(self)(pc.hour(self._pa_array))
2691
2692 def _dt_isocalendar(self):
2693 return type(self)(pc.iso_calendar(self._pa_array))
2694
2695 @property
2696 def _dt_is_leap_year(self):
2697 return type(self)(pc.is_leap_year(self._pa_array))
2698
2699 @property
2700 def _dt_is_month_start(self):
2701 return type(self)(pc.equal(pc.day(self._pa_array), 1))
2702
2703 @property
2704 def _dt_is_month_end(self):
2705 result = pc.equal(
2706 pc.days_between(
2707 pc.floor_temporal(self._pa_array, unit="day"),
2708 pc.ceil_temporal(self._pa_array, unit="month"),
2709 ),
2710 1,
2711 )
2712 return type(self)(result)
2713
2714 @property
2715 def _dt_is_year_start(self):
2716 return type(self)(
2717 pc.and_(
2718 pc.equal(pc.month(self._pa_array), 1),
2719 pc.equal(pc.day(self._pa_array), 1),
2720 )
2721 )
2722
2723 @property
2724 def _dt_is_year_end(self):
2725 return type(self)(
2726 pc.and_(
2727 pc.equal(pc.month(self._pa_array), 12),
2728 pc.equal(pc.day(self._pa_array), 31),
2729 )
2730 )
2731
2732 @property
2733 def _dt_is_quarter_start(self):
2734 result = pc.equal(
2735 pc.floor_temporal(self._pa_array, unit="quarter"),
2736 pc.floor_temporal(self._pa_array, unit="day"),
2737 )
2738 return type(self)(result)
2739
2740 @property
2741 def _dt_is_quarter_end(self):
2742 result = pc.equal(
2743 pc.days_between(
2744 pc.floor_temporal(self._pa_array, unit="day"),
2745 pc.ceil_temporal(self._pa_array, unit="quarter"),
2746 ),
2747 1,
2748 )
2749 return type(self)(result)
2750
2751 @property
2752 def _dt_days_in_month(self):
2753 result = pc.days_between(
2754 pc.floor_temporal(self._pa_array, unit="month"),
2755 pc.ceil_temporal(self._pa_array, unit="month"),
2756 )
2757 return type(self)(result)
2758
2759 _dt_daysinmonth = _dt_days_in_month
2760
2761 @property
2762 def _dt_microsecond(self):
2763 return type(self)(pc.microsecond(self._pa_array))
2764
2765 @property
2766 def _dt_minute(self):
2767 return type(self)(pc.minute(self._pa_array))
2768
2769 @property
2770 def _dt_month(self):
2771 return type(self)(pc.month(self._pa_array))
2772
2773 @property
2774 def _dt_nanosecond(self):
2775 return type(self)(pc.nanosecond(self._pa_array))
2776
2777 @property
2778 def _dt_quarter(self):
2779 return type(self)(pc.quarter(self._pa_array))
2780
2781 @property
2782 def _dt_second(self):
2783 return type(self)(pc.second(self._pa_array))
2784
2785 @property
2786 def _dt_date(self):
2787 return type(self)(self._pa_array.cast(pa.date32()))
2788
2789 @property
2790 def _dt_time(self):
2791 unit = (
2792 self.dtype.pyarrow_dtype.unit
2793 if self.dtype.pyarrow_dtype.unit in {"us", "ns"}
2794 else "ns"
2795 )
2796 return type(self)(self._pa_array.cast(pa.time64(unit)))
2797
2798 @property
2799 def _dt_tz(self):
2800 return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz)
2801
2802 @property
2803 def _dt_unit(self):
2804 return self.dtype.pyarrow_dtype.unit
2805
2806 def _dt_normalize(self):
2807 return type(self)(pc.floor_temporal(self._pa_array, 1, "day"))
2808
2809 def _dt_strftime(self, format: str):
2810 return type(self)(pc.strftime(self._pa_array, format=format))
2811
2812 def _round_temporally(
2813 self,
2814 method: Literal["ceil", "floor", "round"],
2815 freq,
2816 ambiguous: TimeAmbiguous = "raise",
2817 nonexistent: TimeNonexistent = "raise",
2818 ):
2819 if ambiguous != "raise":
2820 raise NotImplementedError("ambiguous is not supported.")
2821 if nonexistent != "raise":
2822 raise NotImplementedError("nonexistent is not supported.")
2823 offset = to_offset(freq)
2824 if offset is None:
2825 raise ValueError(f"Must specify a valid frequency: {freq}")
2826 pa_supported_unit = {
2827 "Y": "year",
2828 "YS": "year",
2829 "Q": "quarter",
2830 "QS": "quarter",
2831 "M": "month",
2832 "MS": "month",
2833 "W": "week",
2834 "D": "day",
2835 "h": "hour",
2836 "min": "minute",
2837 "s": "second",
2838 "ms": "millisecond",
2839 "us": "microsecond",
2840 "ns": "nanosecond",
2841 }
2842 unit = pa_supported_unit.get(offset._prefix, None)
2843 if unit is None:
2844 raise ValueError(f"{freq=} is not supported")
2845 multiple = offset.n
2846 rounding_method = getattr(pc, f"{method}_temporal")
2847 return type(self)(rounding_method(self._pa_array, multiple=multiple, unit=unit))
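
    # For example (illustrative sketch; assumes ``import pandas as pd``):
    #
    #   arr = pd.array([pd.Timestamp("2023-01-15 13:45")],
    #                  dtype="timestamp[ns][pyarrow]")
    #   arr._dt_floor("h")    # -> ['2023-01-15 13:00:00']
    #   arr._dt_round("D")    # -> ['2023-01-16 00:00:00']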
2848
2849 def _dt_ceil(
2850 self,
2851 freq,
2852 ambiguous: TimeAmbiguous = "raise",
2853 nonexistent: TimeNonexistent = "raise",
2854 ):
2855 return self._round_temporally("ceil", freq, ambiguous, nonexistent)
2856
2857 def _dt_floor(
2858 self,
2859 freq,
2860 ambiguous: TimeAmbiguous = "raise",
2861 nonexistent: TimeNonexistent = "raise",
2862 ):
2863 return self._round_temporally("floor", freq, ambiguous, nonexistent)
2864
2865 def _dt_round(
2866 self,
2867 freq,
2868 ambiguous: TimeAmbiguous = "raise",
2869 nonexistent: TimeNonexistent = "raise",
2870 ):
2871 return self._round_temporally("round", freq, ambiguous, nonexistent)
2872
2873 def _dt_day_name(self, locale: str | None = None):
2874 if locale is None:
2875 locale = "C"
2876 return type(self)(pc.strftime(self._pa_array, format="%A", locale=locale))
2877
2878 def _dt_month_name(self, locale: str | None = None):
2879 if locale is None:
2880 locale = "C"
2881 return type(self)(pc.strftime(self._pa_array, format="%B", locale=locale))
2882
2883 def _dt_to_pydatetime(self):
2884 if pa.types.is_date(self.dtype.pyarrow_dtype):
2885 raise ValueError(
2886 f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. "
2887 "Convert to pyarrow timestamp type."
2888 )
2889 data = self._pa_array.to_pylist()
2890 if self._dtype.pyarrow_dtype.unit == "ns":
2891 data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data]
2892 return np.array(data, dtype=object)
2893
2894 def _dt_tz_localize(
2895 self,
2896 tz,
2897 ambiguous: TimeAmbiguous = "raise",
2898 nonexistent: TimeNonexistent = "raise",
2899 ):
2900 if ambiguous != "raise":
2901 raise NotImplementedError(f"{ambiguous=} is not supported")
2902 nonexistent_pa = {
2903 "raise": "raise",
2904 "shift_backward": "earliest",
2905 "shift_forward": "latest",
2906 }.get(
2907 nonexistent, None # type: ignore[arg-type]
2908 )
2909 if nonexistent_pa is None:
2910 raise NotImplementedError(f"{nonexistent=} is not supported")
2911 if tz is None:
2912 result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit))
2913 else:
2914 result = pc.assume_timezone(
2915 self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
2916 )
2917 return type(self)(result)
2918
2919 def _dt_tz_convert(self, tz):
2920 if self.dtype.pyarrow_dtype.tz is None:
2921 raise TypeError(
2922 "Cannot convert tz-naive timestamps, use tz_localize to localize"
2923 )
2924 current_unit = self.dtype.pyarrow_dtype.unit
2925 result = self._pa_array.cast(pa.timestamp(current_unit, tz))
2926 return type(self)(result)
2927
2928
2929def transpose_homogeneous_pyarrow(
2930 arrays: Sequence[ArrowExtensionArray],
2931) -> list[ArrowExtensionArray]:
2932 """Transpose arrow extension arrays in a list, but faster.
2933
2934 Input should be a list of arrays of equal length and all have the same
2935 dtype. The caller is responsible for ensuring validity of input data.
2936 """
2937 arrays = list(arrays)
2938 nrows, ncols = len(arrays[0]), len(arrays)
2939 indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten()
2940 arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks])
2941 arr = arr.take(indices)
2942 return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]
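

# For example (illustrative sketch; assumes ``import pandas as pd``): three
# length-2 columns become two length-3 rows:
#
#   cols = [pd.array([1, 2], dtype="int64[pyarrow]") for _ in range(3)]
#   transpose_homogeneous_pyarrow(cols)
#   # -> [ArrowExtensionArray([1, 1, 1]), ArrowExtensionArray([2, 2, 2])]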