# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/arrow/array.py: 20% (1306 statements)

from __future__ import annotations

import functools
import operator
import re
import textwrap
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    cast,
)
import unicodedata

import numpy as np

from pandas._libs import lib
from pandas._libs.tslibs import (
    NaT,
    Timedelta,
    Timestamp,
    timezones,
)
from pandas.compat import (
    pa_version_under10p1,
    pa_version_under11p0,
    pa_version_under13p0,
)
from pandas.util._decorators import doc
from pandas.util._validators import validate_fillna_kwargs

from pandas.core.dtypes.cast import (
    can_hold_element,
    infer_dtype_from_scalar,
)
from pandas.core.dtypes.common import (
    CategoricalDtype,
    is_array_like,
    is_bool_dtype,
    is_float_dtype,
    is_integer,
    is_list_like,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.dtypes import DatetimeTZDtype
from pandas.core.dtypes.missing import isna

from pandas.core import (
    algorithms as algos,
    missing,
    ops,
    roperator,
)
from pandas.core.algorithms import map_array
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays._arrow_string_mixins import ArrowStringArrayMixin
from pandas.core.arrays._utils import to_numpy_dtype_inference
from pandas.core.arrays.base import (
    ExtensionArray,
    ExtensionArraySupportsAnyAll,
)
from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.arrays.string_ import StringDtype
import pandas.core.common as com
from pandas.core.indexers import (
    check_array_indexer,
    unpack_tuple_and_ellipses,
    validate_indices,
)
from pandas.core.strings.base import BaseStringArrayMethods

from pandas.io._util import _arrow_dtype_mapping
from pandas.tseries.frequencies import to_offset

if not pa_version_under10p1:
    import pyarrow as pa
    import pyarrow.compute as pc

    from pandas.core.dtypes.dtypes import ArrowDtype

    ARROW_CMP_FUNCS = {
        "eq": pc.equal,
        "ne": pc.not_equal,
        "lt": pc.less,
        "gt": pc.greater,
        "le": pc.less_equal,
        "ge": pc.greater_equal,
    }

    ARROW_LOGICAL_FUNCS = {
        "and_": pc.and_kleene,
        "rand_": lambda x, y: pc.and_kleene(y, x),
        "or_": pc.or_kleene,
        "ror_": lambda x, y: pc.or_kleene(y, x),
        "xor": pc.xor,
        "rxor": lambda x, y: pc.xor(y, x),
    }

    ARROW_BIT_WISE_FUNCS = {
        "and_": pc.bit_wise_and,
        "rand_": lambda x, y: pc.bit_wise_and(y, x),
        "or_": pc.bit_wise_or,
        "ror_": lambda x, y: pc.bit_wise_or(y, x),
        "xor": pc.bit_wise_xor,
        "rxor": lambda x, y: pc.bit_wise_xor(y, x),
    }

    def cast_for_truediv(
        arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar
    ) -> tuple[pa.ChunkedArray, pa.Array | pa.Scalar]:
        # Ensure int / int -> float mirroring Python/Numpy behavior
        # as pc.divide_checked(int, int) -> int
        if pa.types.is_integer(arrow_array.type) and pa.types.is_integer(
            pa_object.type
        ):
            # GH: 56645.
            # https://github.com/apache/arrow/issues/35563
            return pc.cast(arrow_array, pa.float64(), safe=False), pc.cast(
                pa_object, pa.float64(), safe=False
            )

        return arrow_array, pa_object
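
    # Illustrative sketch (not part of the original module): cast_for_truediv
    # is what makes "truediv" below return floats for integer inputs, since
    # pyarrow's divide kernel performs integer division on integer types.
    #
    # >>> left, right = cast_for_truediv(pa.chunked_array([[1, 3]]), pa.scalar(2))
    # >>> pc.divide(left, right).to_pylist()
    # [0.5, 1.5]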

    def floordiv_compat(
        left: pa.ChunkedArray | pa.Array | pa.Scalar,
        right: pa.ChunkedArray | pa.Array | pa.Scalar,
    ) -> pa.ChunkedArray:
        # TODO: Replace with pyarrow floordiv kernel.
        # https://github.com/apache/arrow/issues/39386
        if pa.types.is_integer(left.type) and pa.types.is_integer(right.type):
            divided = pc.divide_checked(left, right)
            if pa.types.is_signed_integer(divided.type):
                # GH 56676
                has_remainder = pc.not_equal(pc.multiply(divided, right), left)
                has_one_negative_operand = pc.less(
                    pc.bit_wise_xor(left, right),
                    pa.scalar(0, type=divided.type),
                )
                result = pc.if_else(
                    pc.and_(
                        has_remainder,
                        has_one_negative_operand,
                    ),
                    # GH: 55561
                    pc.subtract(divided, pa.scalar(1, type=divided.type)),
                    divided,
                )
            else:
                result = divided
            result = result.cast(left.type)
        else:
            divided = pc.divide(left, right)
            result = pc.floor(divided)
        return result
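
    # Illustrative check (assumption, not part of the original module):
    # pc.divide_checked truncates integer division toward zero, so the sign
    # correction above is what restores Python's floor semantics when exactly
    # one operand is negative and there is a remainder.
    #
    # >>> floordiv_compat(pa.array([-7, 7]), pa.array([2, 2])).to_pylist()
    # [-4, 3]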

    ARROW_ARITHMETIC_FUNCS = {
        "add": pc.add_checked,
        "radd": lambda x, y: pc.add_checked(y, x),
        "sub": pc.subtract_checked,
        "rsub": lambda x, y: pc.subtract_checked(y, x),
        "mul": pc.multiply_checked,
        "rmul": lambda x, y: pc.multiply_checked(y, x),
        "truediv": lambda x, y: pc.divide(*cast_for_truediv(x, y)),
        "rtruediv": lambda x, y: pc.divide(*cast_for_truediv(y, x)),
        "floordiv": lambda x, y: floordiv_compat(x, y),
        "rfloordiv": lambda x, y: floordiv_compat(y, x),
        "mod": NotImplemented,
        "rmod": NotImplemented,
        "divmod": NotImplemented,
        "rdivmod": NotImplemented,
        "pow": pc.power_checked,
        "rpow": lambda x, y: pc.power_checked(y, x),
    }

if TYPE_CHECKING:
    from collections.abc import Sequence

    from pandas._typing import (
        ArrayLike,
        AxisInt,
        Dtype,
        FillnaOptions,
        InterpolateOptions,
        Iterator,
        NpDtype,
        NumpySorter,
        NumpyValueArrayLike,
        PositionalIndexer,
        Scalar,
        Self,
        SortKind,
        TakeIndexer,
        TimeAmbiguous,
        TimeNonexistent,
        npt,
    )

    from pandas import Series
    from pandas.core.arrays.datetimes import DatetimeArray
    from pandas.core.arrays.timedeltas import TimedeltaArray


def get_unit_from_pa_dtype(pa_dtype):
    # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
    if pa_version_under11p0:
        unit = str(pa_dtype).split("[", 1)[-1][:-1]
        if unit not in ["s", "ms", "us", "ns"]:
            raise ValueError(pa_dtype)
        return unit
    return pa_dtype.unit


def to_pyarrow_type(
    dtype: ArrowDtype | pa.DataType | Dtype | None,
) -> pa.DataType | None:
    """
    Convert dtype to a pyarrow type instance.
    """
    if isinstance(dtype, ArrowDtype):
        return dtype.pyarrow_dtype
    elif isinstance(dtype, pa.DataType):
        return dtype
    elif isinstance(dtype, DatetimeTZDtype):
        return pa.timestamp(dtype.unit, dtype.tz)
    elif dtype:
        try:
            # Accepts python types too
            # Doesn't handle all numpy types
            return pa.from_numpy_dtype(dtype)
        except pa.ArrowNotImplementedError:
            pass
    return None
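
# Illustrative sketch (not part of the original module): to_pyarrow_type
# normalizes the various dtype spellings pandas accepts down to a single
# pa.DataType, or None when no conversion applies.
#
# >>> to_pyarrow_type(pa.int64())
# DataType(int64)
# >>> to_pyarrow_type(np.dtype("float32"))  # via pa.from_numpy_dtype
# DataType(float)
# >>> to_pyarrow_type(None)  # falls through to the final return, yields None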


class ArrowExtensionArray(
    OpsMixin,
    ExtensionArraySupportsAnyAll,
    ArrowStringArrayMixin,
    BaseStringArrayMethods,
):
    """
    Pandas ExtensionArray backed by a PyArrow ChunkedArray.

    .. warning::

       ArrowExtensionArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : pyarrow.Array or pyarrow.ChunkedArray

    Attributes
    ----------
    None

    Methods
    -------
    None

    Returns
    -------
    ArrowExtensionArray

    Notes
    -----
    Most methods are implemented using `pyarrow compute functions. <https://arrow.apache.org/docs/python/api/compute.html>`__
    Some methods may either raise an exception or raise a ``PerformanceWarning`` if an
    associated compute function is not available based on the installed version of PyArrow.

    Please install the latest version of PyArrow to enable the best functionality and avoid
    potential bugs in prior versions of PyArrow.

    Examples
    --------
    Create an ArrowExtensionArray with :func:`pandas.array`:

    >>> pd.array([1, 1, None], dtype="int64[pyarrow]")
    <ArrowExtensionArray>
    [1, 1, <NA>]
    Length: 3, dtype: int64[pyarrow]
    """  # noqa: E501 (http link too long)

    _pa_array: pa.ChunkedArray
    _dtype: ArrowDtype

    def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
        if pa_version_under10p1:
            msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
            raise ImportError(msg)
        if isinstance(values, pa.Array):
            self._pa_array = pa.chunked_array([values])
        elif isinstance(values, pa.ChunkedArray):
            self._pa_array = values
        else:
            raise ValueError(
                f"Unsupported type '{type(values)}' for ArrowExtensionArray"
            )
        self._dtype = ArrowDtype(self._pa_array.type)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        """
        Construct a new ExtensionArray from a sequence of scalars.
        """
        pa_type = to_pyarrow_type(dtype)
        pa_array = cls._box_pa_array(scalars, pa_type=pa_type, copy=copy)
        arr = cls(pa_array)
        return arr

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        """
        Construct a new ExtensionArray from a sequence of strings.
        """
        pa_type = to_pyarrow_type(dtype)
        if (
            pa_type is None
            or pa.types.is_binary(pa_type)
            or pa.types.is_string(pa_type)
            or pa.types.is_large_string(pa_type)
        ):
            # pa_type is None: Let pa.array infer
            # pa_type is string/binary: scalars already correct type
            scalars = strings
        elif pa.types.is_timestamp(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise")
        elif pa.types.is_date(pa_type):
            from pandas.core.tools.datetimes import to_datetime

            scalars = to_datetime(strings, errors="raise").date
        elif pa.types.is_duration(pa_type):
            from pandas.core.tools.timedeltas import to_timedelta

            scalars = to_timedelta(strings, errors="raise")
            if pa_type.unit != "ns":
                # GH51175: test_from_sequence_of_strings_pa_array
                # attempt to parse as int64 reflecting pyarrow's
                # duration to string casting behavior
                mask = isna(scalars)
                if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
                    strings = pa.array(strings, type=pa.string(), from_pandas=True)
                strings = pc.if_else(mask, None, strings)
                try:
                    scalars = strings.cast(pa.int64())
                except pa.ArrowInvalid:
                    pass
        elif pa.types.is_time(pa_type):
            from pandas.core.tools.times import to_time

            # "coerce" to allow "null times" (None) to not raise
            scalars = to_time(strings, errors="coerce")
        elif pa.types.is_boolean(pa_type):
            # pyarrow string->bool casting is case-insensitive:
            #   "true" or "1" -> True
            #   "false" or "0" -> False
            # Note: BooleanArray was previously used to parse these strings
            # and allows "1.0" and "0.0". Pyarrow casting does not support
            # this, but we allow it here.
            if isinstance(strings, (pa.Array, pa.ChunkedArray)):
                scalars = strings
            else:
                scalars = pa.array(strings, type=pa.string(), from_pandas=True)
            scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
            scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
            scalars = scalars.cast(pa.bool_())
        elif (
            pa.types.is_integer(pa_type)
            or pa.types.is_floating(pa_type)
            or pa.types.is_decimal(pa_type)
        ):
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
        return cls._from_sequence(scalars, dtype=pa_type, copy=copy)
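
    # Illustrative sketch (assumption, not part of the original module): the
    # boolean branch above accepts "1.0"/"0.0" in addition to what pyarrow's
    # string -> bool cast allows, e.g.:
    #
    # >>> ArrowExtensionArray._from_sequence_of_strings(
    # ...     ["true", "1.0", "0", None], dtype=pa.bool_()
    # ... )
    # <ArrowExtensionArray>
    # [True, True, False, <NA>]
    # Length: 4, dtype: bool[pyarrow]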

    @classmethod
    def _box_pa(
        cls, value, pa_type: pa.DataType | None = None
    ) -> pa.Array | pa.ChunkedArray | pa.Scalar:
        """
        Box value into a pyarrow Array, ChunkedArray or Scalar.

        Parameters
        ----------
        value : any
        pa_type : pa.DataType | None

        Returns
        -------
        pa.Array or pa.ChunkedArray or pa.Scalar
        """
        if isinstance(value, pa.Scalar) or not is_list_like(value):
            return cls._box_pa_scalar(value, pa_type)
        return cls._box_pa_array(value, pa_type)

    @classmethod
    def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
        """
        Box value into a pyarrow Scalar.

        Parameters
        ----------
        value : any
        pa_type : pa.DataType | None

        Returns
        -------
        pa.Scalar
        """
        if isinstance(value, pa.Scalar):
            pa_scalar = value
        elif isna(value):
            pa_scalar = pa.scalar(None, type=pa_type)
        else:
            # Workaround https://github.com/apache/arrow/issues/37291
            if isinstance(value, Timedelta):
                if pa_type is None:
                    pa_type = pa.duration(value.unit)
                elif value.unit != pa_type.unit:
                    value = value.as_unit(pa_type.unit)
                value = value._value
            elif isinstance(value, Timestamp):
                if pa_type is None:
                    pa_type = pa.timestamp(value.unit, tz=value.tz)
                elif value.unit != pa_type.unit:
                    value = value.as_unit(pa_type.unit)
                value = value._value

            pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)

        if pa_type is not None and pa_scalar.type != pa_type:
            pa_scalar = pa_scalar.cast(pa_type)

        return pa_scalar

    @classmethod
    def _box_pa_array(
        cls, value, pa_type: pa.DataType | None = None, copy: bool = False
    ) -> pa.Array | pa.ChunkedArray:
        """
        Box value into a pyarrow Array or ChunkedArray.

        Parameters
        ----------
        value : Sequence
        pa_type : pa.DataType | None

        Returns
        -------
        pa.Array or pa.ChunkedArray
        """
        if isinstance(value, cls):
            pa_array = value._pa_array
        elif isinstance(value, (pa.Array, pa.ChunkedArray)):
            pa_array = value
        elif isinstance(value, BaseMaskedArray):
            # GH 52625
            if copy:
                value = value.copy()
            pa_array = value.__arrow_array__()
        else:
            if (
                isinstance(value, np.ndarray)
                and pa_type is not None
                and (
                    pa.types.is_large_binary(pa_type)
                    or pa.types.is_large_string(pa_type)
                )
            ):
                # See https://github.com/apache/arrow/issues/35289
                value = value.tolist()
            elif copy and is_array_like(value):
                # pa array should not get updated when numpy array is updated
                value = value.copy()

            if (
                pa_type is not None
                and pa.types.is_duration(pa_type)
                and (not isinstance(value, np.ndarray) or value.dtype.kind not in "mi")
            ):
                # Workaround https://github.com/apache/arrow/issues/37291
                from pandas.core.tools.timedeltas import to_timedelta

                value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
                value = value.to_numpy()

            try:
                pa_array = pa.array(value, type=pa_type, from_pandas=True)
            except (pa.ArrowInvalid, pa.ArrowTypeError):
                # GH50430: let pyarrow infer type, then cast
                pa_array = pa.array(value, from_pandas=True)

            if pa_type is None and pa.types.is_duration(pa_array.type):
                # Workaround https://github.com/apache/arrow/issues/37291
                from pandas.core.tools.timedeltas import to_timedelta

                value = to_timedelta(value)
                value = value.to_numpy()
                pa_array = pa.array(value, type=pa_type, from_pandas=True)

            if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
                # GH52843: upstream bug for duration types when originally
                # constructed with data containing numpy NaT.
                # https://github.com/apache/arrow/issues/35088
                arr = cls(pa_array)
                arr = arr.fillna(arr.dtype.na_value)
                pa_array = arr._pa_array

        if pa_type is not None and pa_array.type != pa_type:
            if pa.types.is_dictionary(pa_type):
                pa_array = pa_array.dictionary_encode()
            else:
                try:
                    pa_array = pa_array.cast(pa_type)
                except (
                    pa.ArrowInvalid,
                    pa.ArrowTypeError,
                    pa.ArrowNotImplementedError,
                ):
                    if pa.types.is_string(pa_array.type) or pa.types.is_large_string(
                        pa_array.type
                    ):
                        # TODO: Move logic in _from_sequence_of_strings into
                        # _box_pa_array
                        return cls._from_sequence_of_strings(
                            value, dtype=pa_type
                        )._pa_array
                    else:
                        raise

        return pa_array

    def __getitem__(self, item: PositionalIndexer):
        """Select a subset of self.

        Parameters
        ----------
        item : int, slice, or ndarray
            * int: The position in 'self' to get.
            * slice: A slice object, where 'start', 'stop', and 'step' are
              integers or None
            * ndarray: A 1-d boolean NumPy ndarray the same length as 'self'

        Returns
        -------
        item : scalar or ExtensionArray

        Notes
        -----
        For scalar ``item``, return a scalar value suitable for the array's
        type. This should be an instance of ``self.dtype.type``.
        For slice ``key``, return an instance of ``ExtensionArray``, even
        if the slice is length 0 or 1.
        For a boolean mask, return an instance of ``ExtensionArray``, filtered
        to the values where ``item`` is True.
        """
        item = check_array_indexer(self, item)

        if isinstance(item, np.ndarray):
            if not len(item):
                # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string]
                if self._dtype.name == "string" and self._dtype.storage in (
                    "pyarrow",
                    "pyarrow_numpy",
                ):
                    pa_dtype = pa.string()
                else:
                    pa_dtype = self._dtype.pyarrow_dtype
                return type(self)(pa.chunked_array([], type=pa_dtype))
            elif item.dtype.kind in "iu":
                return self.take(item)
            elif item.dtype.kind == "b":
                return type(self)(self._pa_array.filter(item))
            else:
                raise IndexError(
                    "Only integers, slices and integer or "
                    "boolean arrays are valid indices."
                )
        elif isinstance(item, tuple):
            item = unpack_tuple_and_ellipses(item)

        if item is Ellipsis:
            # TODO: should be handled by pyarrow?
            item = slice(None)

        if is_scalar(item) and not is_integer(item):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )
        # We are not an array indexer, so maybe e.g. a slice or integer
        # indexer. We dispatch to pyarrow.
        if isinstance(item, slice):
            # Arrow bug https://github.com/apache/arrow/issues/38768
            if item.start == item.stop:
                pass
            elif (
                item.stop is not None
                and item.stop < -len(self)
                and item.step is not None
                and item.step < 0
            ):
                item = slice(item.start, None, item.step)

        value = self._pa_array[item]
        if isinstance(value, pa.ChunkedArray):
            return type(self)(value)
        else:
            pa_type = self._pa_array.type
            scalar = value.as_py()
            if scalar is None:
                return self._dtype.na_value
            elif pa.types.is_timestamp(pa_type) and pa_type.unit != "ns":
                # GH 53326
                return Timestamp(scalar).as_unit(pa_type.unit)
            elif pa.types.is_duration(pa_type) and pa_type.unit != "ns":
                # GH 53326
                return Timedelta(scalar).as_unit(pa_type.unit)
            else:
                return scalar
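
    # Illustrative sketch (not part of the original module): scalar indexing
    # unboxes to a Python scalar (or the dtype's NA value), while slices and
    # masks stay ArrowExtensionArray.
    #
    # >>> arr = pd.array([1, 2, None], dtype="int64[pyarrow]")
    # >>> arr[0]
    # 1
    # >>> arr[2] is pd.NA
    # True
    # >>> arr[:2]
    # <ArrowExtensionArray>
    # [1, 2]
    # Length: 2, dtype: int64[pyarrow]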

    def __iter__(self) -> Iterator[Any]:
        """
        Iterate over elements of the array.
        """
        na_value = self._dtype.na_value
        # GH 53326
        pa_type = self._pa_array.type
        box_timestamp = pa.types.is_timestamp(pa_type) and pa_type.unit != "ns"
        box_timedelta = pa.types.is_duration(pa_type) and pa_type.unit != "ns"
        for value in self._pa_array:
            val = value.as_py()
            if val is None:
                yield na_value
            elif box_timestamp:
                yield Timestamp(val).as_unit(pa_type.unit)
            elif box_timedelta:
                yield Timedelta(val).as_unit(pa_type.unit)
            else:
                yield val

    def __arrow_array__(self, type=None):
        """Convert myself to a pyarrow ChunkedArray."""
        return self._pa_array

    def __array__(
        self, dtype: NpDtype | None = None, copy: bool | None = None
    ) -> np.ndarray:
        """Correctly construct numpy arrays when passed to `np.asarray()`."""
        return self.to_numpy(dtype=dtype)

    def __invert__(self) -> Self:
        # This is a bit wise op for integer types
        if pa.types.is_integer(self._pa_array.type):
            return type(self)(pc.bit_wise_not(self._pa_array))
        elif pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
            self._pa_array.type
        ):
            # Raise TypeError instead of pa.ArrowNotImplementedError
            raise TypeError("__invert__ is not supported for string dtypes")
        else:
            return type(self)(pc.invert(self._pa_array))

    def __neg__(self) -> Self:
        return type(self)(pc.negate_checked(self._pa_array))

    def __pos__(self) -> Self:
        return type(self)(self._pa_array)

    def __abs__(self) -> Self:
        return type(self)(pc.abs_checked(self._pa_array))

    # GH 42600: __getstate__/__setstate__ not necessary once
    # https://issues.apache.org/jira/browse/ARROW-10739 is addressed
    def __getstate__(self):
        state = self.__dict__.copy()
        state["_pa_array"] = self._pa_array.combine_chunks()
        return state

    def __setstate__(self, state) -> None:
        if "_data" in state:
            data = state.pop("_data")
        else:
            data = state["_pa_array"]
        state["_pa_array"] = pa.chunked_array(data)
        self.__dict__.update(state)

    def _cmp_method(self, other, op):
        pc_func = ARROW_CMP_FUNCS[op.__name__]
        if isinstance(
            other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray)
        ) or isinstance(getattr(other, "dtype", None), CategoricalDtype):
            result = pc_func(self._pa_array, self._box_pa(other))
        elif is_scalar(other):
            try:
                result = pc_func(self._pa_array, self._box_pa(other))
            except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid):
                mask = isna(self) | isna(other)
                valid = ~mask
                result = np.zeros(len(self), dtype="bool")
                np_array = np.array(self)
                try:
                    result[valid] = op(np_array[valid], other)
                except TypeError:
                    result = ops.invalid_comparison(np_array, other, op)
                result = pa.array(result, type=pa.bool_())
                result = pc.if_else(valid, result, None)
        else:
            raise NotImplementedError(
                f"{op.__name__} not implemented for {type(other)}"
            )
        return ArrowExtensionArray(result)

    def _evaluate_op_method(self, other, op, arrow_funcs):
        pa_type = self._pa_array.type
        other = self._box_pa(other)

        if (
            pa.types.is_string(pa_type)
            or pa.types.is_large_string(pa_type)
            or pa.types.is_binary(pa_type)
        ):
            if op in [operator.add, roperator.radd]:
                sep = pa.scalar("", type=pa_type)
                if op is operator.add:
                    result = pc.binary_join_element_wise(self._pa_array, other, sep)
                elif op is roperator.radd:
                    result = pc.binary_join_element_wise(other, self._pa_array, sep)
                return type(self)(result)
            elif op in [operator.mul, roperator.rmul]:
                binary = self._pa_array
                integral = other
                if not pa.types.is_integer(integral.type):
                    raise TypeError("Can only string multiply by an integer.")
                pa_integral = pc.if_else(pc.less(integral, 0), 0, integral)
                result = pc.binary_repeat(binary, pa_integral)
                return type(self)(result)
        elif (
            pa.types.is_string(other.type)
            or pa.types.is_binary(other.type)
            or pa.types.is_large_string(other.type)
        ) and op in [operator.mul, roperator.rmul]:
            binary = other
            integral = self._pa_array
            if not pa.types.is_integer(integral.type):
                raise TypeError("Can only string multiply by an integer.")
            pa_integral = pc.if_else(pc.less(integral, 0), 0, integral)
            result = pc.binary_repeat(binary, pa_integral)
            return type(self)(result)
        if (
            isinstance(other, pa.Scalar)
            and pc.is_null(other).as_py()
            and op.__name__ in ARROW_LOGICAL_FUNCS
        ):
            # pyarrow kleene ops require null to be typed
            other = other.cast(pa_type)

        pc_func = arrow_funcs[op.__name__]
        if pc_func is NotImplemented:
            raise NotImplementedError(f"{op.__name__} not implemented.")

        result = pc_func(self._pa_array, other)
        return type(self)(result)

    def _logical_method(self, other, op):
        # For integer types `^`, `|`, `&` are bitwise operators and return
        # integer types. Otherwise these are boolean ops.
        if pa.types.is_integer(self._pa_array.type):
            return self._evaluate_op_method(other, op, ARROW_BIT_WISE_FUNCS)
        else:
            return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS)

    def _arith_method(self, other, op):
        return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS)

    def equals(self, other) -> bool:
        if not isinstance(other, ArrowExtensionArray):
            return False
        # I'm told that pyarrow makes __eq__ behave like pandas' equals;
        # TODO: is this documented somewhere?
        return self._pa_array == other._pa_array

    @property
    def dtype(self) -> ArrowDtype:
        """
        An instance of 'ExtensionDtype'.
        """
        return self._dtype

    @property
    def nbytes(self) -> int:
        """
        The number of bytes needed to store this object in memory.
        """
        return self._pa_array.nbytes

    def __len__(self) -> int:
        """
        Length of this array.

        Returns
        -------
        length : int
        """
        return len(self._pa_array)

    def __contains__(self, key) -> bool:
        # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604
        if isna(key) and key is not self.dtype.na_value:
            if self.dtype.kind == "f" and lib.is_float(key):
                return pc.any(pc.is_nan(self._pa_array)).as_py()

            # e.g. date or timestamp types we do not allow None here to match pd.NA
            return False
            # TODO: maybe complex? object?

        return bool(super().__contains__(key))
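
    # Illustrative note (assumption, not part of the original module): for
    # float dtypes, NaN containment is checked value-wise against the stored
    # data, while None/NaN never match the NA sentinel of non-float dtypes.
    #
    # >>> float("nan") in ArrowExtensionArray(pa.array([1.0, float("nan")]))
    # True
    # >>> None in pd.array([1, None], dtype="int64[pyarrow]")
    # False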

    @property
    def _hasna(self) -> bool:
        return self._pa_array.null_count > 0

    def isna(self) -> npt.NDArray[np.bool_]:
        """
        Boolean NumPy array indicating if each value is missing.

        This should return a 1-D array the same length as 'self'.
        """
        # GH51630: fast paths
        null_count = self._pa_array.null_count
        if null_count == 0:
            return np.zeros(len(self), dtype=np.bool_)
        elif null_count == len(self):
            return np.ones(len(self), dtype=np.bool_)

        return self._pa_array.is_null().to_numpy()

    def any(self, *, skipna: bool = True, **kwargs):
        """
        Return whether any element is truthy.

        Returns False unless there is at least one element that is truthy.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be False, as for an empty array.
            If `skipna` is False, the result will still be True if there is
            at least one element that is truthy, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.all : Return whether all elements are truthy.

        Examples
        --------
        The result indicates whether any element is truthy (and by default
        skips NAs):

        >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any()
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any()
        False
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").any()
        False

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        True
        >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False)
        <NA>
        """
        return self._reduce("any", skipna=skipna, **kwargs)

    def all(self, *, skipna: bool = True, **kwargs):
        """
        Return whether all elements are truthy.

        Returns True unless there is at least one element that is falsey.
        By default, NAs are skipped. If ``skipna=False`` is specified and
        missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
        is used as for logical operations.

        Parameters
        ----------
        skipna : bool, default True
            Exclude NA values. If the entire array is NA and `skipna` is
            True, then the result will be True, as for an empty array.
            If `skipna` is False, the result will still be False if there is
            at least one element that is falsey, otherwise NA will be returned
            if there are NA's present.

        Returns
        -------
        bool or :attr:`pandas.NA`

        See Also
        --------
        ArrowExtensionArray.any : Return whether any element is truthy.

        Examples
        --------
        The result indicates whether all elements are truthy (and by default
        skips NAs):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all()
        False
        >>> pd.array([], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all()
        True
        >>> pd.array([pd.NA], dtype="float64[pyarrow]").all()
        True

        With ``skipna=False``, the result can be NA if this is logically
        required (whether ``pd.NA`` is True or False influences the result):

        >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        <NA>
        >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False)
        False
        """
        return self._reduce("all", skipna=skipna, **kwargs)

    def argsort(
        self,
        *,
        ascending: bool = True,
        kind: SortKind = "quicksort",
        na_position: str = "last",
        **kwargs,
    ) -> np.ndarray:
        order = "ascending" if ascending else "descending"
        null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None)
        if null_placement is None:
            raise ValueError(f"invalid na_position: {na_position}")

        result = pc.array_sort_indices(
            self._pa_array, order=order, null_placement=null_placement
        )
        np_result = result.to_numpy()
        return np_result.astype(np.intp, copy=False)
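
    # Illustrative sketch (not part of the original module): argsort returns
    # numpy positions, with NA placement controlled by na_position.
    #
    # >>> pd.array([3, None, 1], dtype="int64[pyarrow]").argsort()
    # array([2, 0, 1])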

    def _argmin_max(self, skipna: bool, method: str) -> int:
        if self._pa_array.length() in (0, self._pa_array.null_count) or (
            self._hasna and not skipna
        ):
            # For empty or all null, pyarrow returns -1 but pandas expects TypeError
            # For skipna=False and data w/ null, pandas expects NotImplementedError
            # let ExtensionArray.arg{max|min} raise
            return getattr(super(), f"arg{method}")(skipna=skipna)

        data = self._pa_array
        if pa.types.is_duration(data.type):
            data = data.cast(pa.int64())

        value = getattr(pc, method)(data, skip_nulls=skipna)
        return pc.index(data, value).as_py()

    def argmin(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "min")

    def argmax(self, skipna: bool = True) -> int:
        return self._argmin_max(skipna, "max")

    def copy(self) -> Self:
        """
        Return a shallow copy of the array.

        Underlying ChunkedArray is immutable, so a deep copy is unnecessary.

        Returns
        -------
        type(self)
        """
        return type(self)(self._pa_array)

    def dropna(self) -> Self:
        """
        Return ArrowExtensionArray without NA values.

        Returns
        -------
        ArrowExtensionArray
        """
        return type(self)(pc.drop_null(self._pa_array))

    def _pad_or_backfill(
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        if not self._hasna:
            # TODO(CoW): Not necessary anymore when CoW is the default
            return self.copy()

        if limit is None and limit_area is None:
            method = missing.clean_fill_method(method)
            try:
                if method == "pad":
                    return type(self)(pc.fill_null_forward(self._pa_array))
                elif method == "backfill":
                    return type(self)(pc.fill_null_backward(self._pa_array))
            except pa.ArrowNotImplementedError:
                # ArrowNotImplementedError: Function 'coalesce' has no kernel
                #   matching input types (duration[ns], duration[ns])
                # TODO: remove try/except wrapper if/when pyarrow implements
                #   a kernel for duration types.
                pass

        # TODO(3.0): after EA.fillna 'method' deprecation is enforced, we can remove
        #  this method entirely.
        return super()._pad_or_backfill(
            method=method, limit=limit, limit_area=limit_area, copy=copy
        )

    @doc(ExtensionArray.fillna)
    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        copy: bool = True,
    ) -> Self:
        value, method = validate_fillna_kwargs(value, method)

        if not self._hasna:
            # TODO(CoW): Not necessary anymore when CoW is the default
            return self.copy()

        if limit is not None:
            return super().fillna(value=value, method=method, limit=limit, copy=copy)

        if method is not None:
            return super().fillna(method=method, limit=limit, copy=copy)

        if isinstance(value, (np.ndarray, ExtensionArray)):
            # Similar to check_value_size, but we do not mask here since we may
            # end up passing it to the super() method.
            if len(value) != len(self):
                raise ValueError(
                    f"Length of 'value' does not match. Got ({len(value)}) "
                    f" expected {len(self)}"
                )

        try:
            fill_value = self._box_pa(value, pa_type=self._pa_array.type)
        except pa.ArrowTypeError as err:
            msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
            raise TypeError(msg) from err

        try:
            return type(self)(pc.fill_null(self._pa_array, fill_value=fill_value))
        except pa.ArrowNotImplementedError:
            # ArrowNotImplementedError: Function 'coalesce' has no kernel
            #   matching input types (duration[ns], duration[ns])
            # TODO: remove try/except wrapper if/when pyarrow implements
            #   a kernel for duration types.
            pass

        return super().fillna(value=value, method=method, limit=limit, copy=copy)

    def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        # short-circuit to return all False array.
        if not len(values):
            return np.zeros(len(self), dtype=bool)

        result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)

    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
        """
        Return an array and missing value suitable for factorization.

        Returns
        -------
        values : ndarray
        na_value : pd.NA

        Notes
        -----
        The values returned by this method are also used in
        :func:`pandas.util.hash_pandas_object`.
        """
        values = self._pa_array.to_numpy()
        return values, self.dtype.na_value

    @doc(ExtensionArray.factorize)
    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, ExtensionArray]:
        null_encoding = "mask" if use_na_sentinel else "encode"

        data = self._pa_array
        pa_type = data.type
        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = data.cast(pa.int64())

        if pa.types.is_dictionary(data.type):
            encoded = data
        else:
            encoded = data.dictionary_encode(null_encoding=null_encoding)
        if encoded.length() == 0:
            indices = np.array([], dtype=np.intp)
            uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type))
        else:
            # GH 54844
            combined = encoded.combine_chunks()
            pa_indices = combined.indices
            if pa_indices.null_count > 0:
                pa_indices = pc.fill_null(pa_indices, -1)
            indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype(
                np.intp, copy=False
            )
            uniques = type(self)(combined.dictionary)

        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype))
        return indices, uniques
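
    # Illustrative sketch (not part of the original module): factorize encodes
    # values as integer codes plus uniques, with -1 marking NA when
    # use_na_sentinel=True.
    #
    # >>> codes, uniques = pd.array([3, None, 3], dtype="int64[pyarrow]").factorize()
    # >>> codes
    # array([ 0, -1,  0])
    # >>> uniques
    # <ArrowExtensionArray>
    # [3]
    # Length: 1, dtype: int64[pyarrow]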

    def reshape(self, *args, **kwargs):
        raise NotImplementedError(
            f"{type(self)} does not support reshape "
            f"as backed by a 1D pyarrow.ChunkedArray."
        )

    def round(self, decimals: int = 0, *args, **kwargs) -> Self:
        """
        Round each value in the array to the given number of decimals.

        Parameters
        ----------
        decimals : int, default 0
            Number of decimal places to round to. If decimals is negative,
            it specifies the number of positions to the left of the decimal point.
        *args, **kwargs
            Additional arguments and keywords have no effect.

        Returns
        -------
        ArrowExtensionArray
            Rounded values of the ArrowExtensionArray.

        See Also
        --------
        DataFrame.round : Round values of a DataFrame.
        Series.round : Round values of a Series.
        """
        return type(self)(pc.round(self._pa_array, ndigits=decimals))

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        if isinstance(value, ExtensionArray):
            value = value.astype(object)
        # Base class searchsorted would cast to object, which is *much* slower.
        dtype = None
        if isinstance(self.dtype, ArrowDtype):
            pa_dtype = self.dtype.pyarrow_dtype
            if (
                pa.types.is_timestamp(pa_dtype) or pa.types.is_duration(pa_dtype)
            ) and pa_dtype.unit == "ns":
                # np.array[datetime/timedelta].searchsorted(datetime/timedelta)
                # erroneously fails when numpy type resolution is nanoseconds
                dtype = object
        return self.to_numpy(dtype=dtype).searchsorted(value, side=side, sorter=sorter)

    def take(
        self,
        indices: TakeIndexer,
        allow_fill: bool = False,
        fill_value: Any = None,
    ) -> ArrowExtensionArray:
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int or one-dimensional np.ndarray of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any other
              negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        ExtensionArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take
        api.extensions.take

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.
        """
        indices_array = np.asanyarray(indices)

        if len(self._pa_array) == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= len(self._pa_array):
            raise IndexError("out of bounds value in 'indices'.")

        if allow_fill:
            fill_mask = indices_array < 0
            if fill_mask.any():
                validate_indices(indices_array, len(self._pa_array))
                # TODO(ARROW-9433): Treat negative indices as NULL
                indices_array = pa.array(indices_array, mask=fill_mask)
                result = self._pa_array.take(indices_array)
                if isna(fill_value):
                    return type(self)(result)
                # TODO: ArrowNotImplementedError: Function fill_null has no
                # kernel matching input types (array[string], scalar[string])
                result = type(self)(result)
                result[fill_mask] = fill_value
                return result
                # return type(self)(pc.fill_null(result, pa.scalar(fill_value)))
            else:
                # Nothing to fill
                return type(self)(self._pa_array.take(indices))
        else:  # allow_fill=False
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            if (indices_array < 0).any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[indices_array < 0] += len(self._pa_array)
            return type(self)(self._pa_array.take(indices_array))
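
    # Illustrative sketch (not part of the original module): with
    # allow_fill=True, -1 entries become the dtype's NA value.
    #
    # >>> arr = pd.array([10, 20, 30], dtype="int64[pyarrow]")
    # >>> arr.take([0, -1, 2], allow_fill=True)
    # <ArrowExtensionArray>
    # [10, <NA>, 30]
    # Length: 3, dtype: int64[pyarrow]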

    def _maybe_convert_datelike_array(self):
        """Maybe convert to a datelike array."""
        pa_type = self._pa_array.type
        if pa.types.is_timestamp(pa_type):
            return self._to_datetimearray()
        elif pa.types.is_duration(pa_type):
            return self._to_timedeltaarray()
        return self

    def _to_datetimearray(self) -> DatetimeArray:
        """Convert a pyarrow timestamp typed array to a DatetimeArray."""
        from pandas.core.arrays.datetimes import (
            DatetimeArray,
            tz_to_dtype,
        )

        pa_type = self._pa_array.type
        assert pa.types.is_timestamp(pa_type)
        np_dtype = np.dtype(f"M8[{pa_type.unit}]")
        dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
        np_array = self._pa_array.to_numpy()
        np_array = np_array.astype(np_dtype)
        return DatetimeArray._simple_new(np_array, dtype=dtype)

    def _to_timedeltaarray(self) -> TimedeltaArray:
        """Convert a pyarrow duration typed array to a TimedeltaArray."""
        from pandas.core.arrays.timedeltas import TimedeltaArray

        pa_type = self._pa_array.type
        assert pa.types.is_duration(pa_type)
        np_dtype = np.dtype(f"m8[{pa_type.unit}]")
        np_array = self._pa_array.to_numpy()
        np_array = np_array.astype(np_dtype)
        return TimedeltaArray._simple_new(np_array, dtype=np_dtype)

    def _values_for_json(self) -> np.ndarray:
        if is_numeric_dtype(self.dtype):
            return np.asarray(self, dtype=object)
        return super()._values_for_json()

    @doc(ExtensionArray.to_numpy)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        original_na_value = na_value
        dtype, na_value = to_numpy_dtype_inference(self, dtype, na_value, self._hasna)
        pa_type = self._pa_array.type
        if not self._hasna or isna(na_value) or pa.types.is_null(pa_type):
            data = self
        else:
            data = self.fillna(na_value)
            copy = False

        if pa.types.is_timestamp(pa_type) or pa.types.is_duration(pa_type):
            # GH 55997
            if dtype != object and na_value is self.dtype.na_value:
                na_value = lib.no_default
            result = data._maybe_convert_datelike_array().to_numpy(
                dtype=dtype, na_value=na_value
            )
        elif pa.types.is_time(pa_type) or pa.types.is_date(pa_type):
            # convert to list of python datetime.time objects before
            # wrapping in ndarray
            result = np.array(list(data), dtype=dtype)
            if data._hasna:
                result[data.isna()] = na_value
        elif pa.types.is_null(pa_type):
            if dtype is not None and isna(na_value):
                na_value = None
            result = np.full(len(data), fill_value=na_value, dtype=dtype)
        elif not data._hasna or (
            pa.types.is_floating(pa_type)
            and (
                na_value is np.nan
                or original_na_value is lib.no_default
                and is_float_dtype(dtype)
            )
        ):
            result = data._pa_array.to_numpy()
            if dtype is not None:
                result = result.astype(dtype, copy=False)
            if copy:
                result = result.copy()
        else:
            if dtype is None:
                empty = pa.array([], type=pa_type).to_numpy(zero_copy_only=False)
                if can_hold_element(empty, na_value):
                    dtype = empty.dtype
                else:
                    dtype = np.object_
            result = np.empty(len(data), dtype=dtype)
            mask = data.isna()
            result[mask] = na_value
            result[~mask] = data[~mask]._pa_array.to_numpy()
        return result
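
    # Illustrative sketch (not part of the original module): NA handling
    # depends on the target dtype; float output maps NA to nan, while object
    # output preserves pd.NA.
    #
    # >>> arr = pd.array([1, None], dtype="int64[pyarrow]")
    # >>> arr.to_numpy(dtype="float64")
    # array([ 1., nan])
    # >>> arr.to_numpy(dtype=object)
    # array([1, <NA>], dtype=object)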

    def map(self, mapper, na_action=None):
        if is_numeric_dtype(self.dtype):
            return map_array(self.to_numpy(), mapper, na_action=na_action)
        else:
            return super().map(mapper, na_action)

    @doc(ExtensionArray.duplicated)
    def duplicated(
        self, keep: Literal["first", "last", False] = "first"
    ) -> npt.NDArray[np.bool_]:
        pa_type = self._pa_array.type
        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
            values = self.to_numpy(na_value=0)
        elif pa.types.is_boolean(pa_type):
            values = self.to_numpy(na_value=False)
        elif pa.types.is_temporal(pa_type):
            if pa_type.bit_width == 32:
                pa_type = pa.int32()
            else:
                pa_type = pa.int64()
            arr = self.astype(ArrowDtype(pa_type))
            values = arr.to_numpy(na_value=0)
        else:
            # factorize the values to avoid the performance penalty of
            # converting to object dtype
            values = self.factorize()[0]

        mask = self.isna() if self._hasna else None
        return algos.duplicated(values, keep=keep, mask=mask)

    def unique(self) -> Self:
        """
        Compute the ArrowExtensionArray of unique values.

        Returns
        -------
        ArrowExtensionArray
        """
        pa_type = self._pa_array.type

        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._pa_array.cast(pa.int64())
        else:
            data = self._pa_array

        pa_result = pc.unique(data)

        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            pa_result = pa_result.cast(pa_type)

        return type(self)(pa_result)

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Return a Series containing counts of each unique value.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of missing values.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        pa_type = self._pa_array.type
        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
            data = self._pa_array.cast(pa.int64())
        else:
            data = self._pa_array

        from pandas import (
            Index,
            Series,
        )

        vc = data.value_counts()

        values = vc.field(0)
        counts = vc.field(1)
        if dropna and data.null_count > 0:
            mask = values.is_valid()
            values = values.filter(mask)
            counts = counts.filter(mask)

        if pa_version_under11p0 and pa.types.is_duration(pa_type):
            values = values.cast(pa_type)

        counts = ArrowExtensionArray(counts)

        index = Index(type(self)(values))

        return Series(counts, index=index, name="count", copy=False)
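
    # Illustrative sketch (not part of the original module): counts come back
    # as a pyarrow-backed Series indexed by the unique values.
    #
    # >>> pd.array([1, 2, 1], dtype="int64[pyarrow]").value_counts()
    # 1    2
    # 2    1
    # Name: count, dtype: int64[pyarrow]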

    @classmethod
    def _concat_same_type(cls, to_concat) -> Self:
        """
        Concatenate multiple ArrowExtensionArrays.

        Parameters
        ----------
        to_concat : sequence of ArrowExtensionArrays

        Returns
        -------
        ArrowExtensionArray
        """
        chunks = [array for ea in to_concat for array in ea._pa_array.iterchunks()]
        if to_concat[0].dtype == "string":
            # StringDtype has no attribute pyarrow_dtype
            pa_dtype = pa.large_string()
        else:
            pa_dtype = to_concat[0].dtype.pyarrow_dtype
        arr = pa.chunked_array(chunks, type=pa_dtype)
        return cls(arr)

    def _accumulate(
        self, name: str, *, skipna: bool = True, **kwargs
    ) -> ArrowExtensionArray | ExtensionArray:
        """
        Return an ExtensionArray performing an accumulation operation.

        The underlying data type might change.

        Parameters
        ----------
        name : str
            Name of the function, supported values are:
            - cummin
            - cummax
            - cumsum
            - cumprod
        skipna : bool, default True
            If True, skip NA values.
        **kwargs
            Additional keyword arguments passed to the accumulation function.
            Currently, there is no supported kwarg.

        Returns
        -------
        array

        Raises
        ------
        NotImplementedError : subclass does not define accumulations
        """
        pyarrow_name = {
            "cummax": "cumulative_max",
            "cummin": "cumulative_min",
            "cumprod": "cumulative_prod_checked",
            "cumsum": "cumulative_sum_checked",
        }.get(name, name)
        pyarrow_meth = getattr(pc, pyarrow_name, None)
        if pyarrow_meth is None:
            return super()._accumulate(name, skipna=skipna, **kwargs)

        data_to_accum = self._pa_array

        pa_dtype = data_to_accum.type

        convert_to_int = (
            pa.types.is_temporal(pa_dtype) and name in ["cummax", "cummin"]
        ) or (pa.types.is_duration(pa_dtype) and name == "cumsum")

        if convert_to_int:
            if pa_dtype.bit_width == 32:
                data_to_accum = data_to_accum.cast(pa.int32())
            else:
                data_to_accum = data_to_accum.cast(pa.int64())

        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)

        if convert_to_int:
            result = result.cast(pa_dtype)

        return type(self)(result)
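
    # Illustrative sketch (not part of the original module): accumulations
    # dispatch to the pyarrow cumulative kernels, e.g. cumsum:
    #
    # >>> pd.array([1, 2, None, 4], dtype="int64[pyarrow]")._accumulate("cumsum")
    # <ArrowExtensionArray>
    # [1, 3, <NA>, 7]
    # Length: 4, dtype: int64[pyarrow]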

1599 

1600 def _reduce_pyarrow(self, name: str, *, skipna: bool = True, **kwargs) -> pa.Scalar: 

1601 """ 

1602 Return a pyarrow scalar result of performing the reduction operation. 

1603 

1604 Parameters 

1605 ---------- 

1606 name : str 

1607 Name of the function, supported values are: 

1608 { any, all, min, max, sum, mean, median, prod, 

1609 std, var, sem, kurt, skew }. 

1610 skipna : bool, default True 

1611 If True, skip NaN values. 

1612 **kwargs 

1613 Additional keyword arguments passed to the reduction function. 

1614 Currently, `ddof` is the only supported kwarg. 

1615 

1616 Returns 

1617 ------- 

1618 pyarrow scalar 

1619 

1620 Raises 

1621 ------ 

1622 TypeError : subclass does not define reductions 

1623 """ 

1624 pa_type = self._pa_array.type 

1625 

1626 data_to_reduce = self._pa_array 

1627 

1628 cast_kwargs = {} if pa_version_under13p0 else {"safe": False} 

1629 

1630 if name in ["any", "all"] and ( 

1631 pa.types.is_integer(pa_type) 

1632 or pa.types.is_floating(pa_type) 

1633 or pa.types.is_duration(pa_type) 

1634 or pa.types.is_decimal(pa_type) 

1635 ): 

1636 # pyarrow only supports any/all for boolean dtype, we allow 

1637 # for other dtypes, matching our non-pyarrow behavior 

1638 

1639 if pa.types.is_duration(pa_type): 

1640 data_to_cmp = self._pa_array.cast(pa.int64()) 

1641 else: 

1642 data_to_cmp = self._pa_array 

1643 

1644 not_eq = pc.not_equal(data_to_cmp, 0) 

1645 data_to_reduce = not_eq 

1646 

1647 elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): 

1648 data_to_reduce = self._pa_array.cast(pa.int64()) 

1649 

1650 elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type): 

1651 nbits = pa_type.bit_width 

1652 if nbits == 32: 

1653 data_to_reduce = self._pa_array.cast(pa.int32()) 

1654 else: 

1655 data_to_reduce = self._pa_array.cast(pa.int64()) 

1656 

1657 if name == "sem": 

1658 

1659 def pyarrow_meth(data, skip_nulls, **kwargs): 

1660 numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs) 

1661 denominator = pc.sqrt_checked(pc.count(self._pa_array)) 

1662 return pc.divide_checked(numerator, denominator) 

1663 

1664 else: 

1665 pyarrow_name = { 

1666 "median": "quantile", 

1667 "prod": "product", 

1668 "std": "stddev", 

1669 "var": "variance", 

1670 }.get(name, name) 

1671 # error: Incompatible types in assignment 

1672 # (expression has type "Optional[Any]", variable has type 

1673 # "Callable[[Any, Any, KwArg(Any)], Any]") 

1674 pyarrow_meth = getattr(pc, pyarrow_name, None) # type: ignore[assignment] 

1675 if pyarrow_meth is None: 

1676 # Let ExtensionArray._reduce raise the TypeError 

1677 return super()._reduce(name, skipna=skipna, **kwargs) 

1678 

1679 # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0 

1680 if name in ["any", "all"] and "min_count" not in kwargs: 

1681 kwargs["min_count"] = 0 

1682 elif name == "median": 

1683 # GH 52679: Use quantile instead of approximate_median 

1684 kwargs["q"] = 0.5 

1685 

1686 try: 

1687 result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs) 

1688 except (AttributeError, NotImplementedError, TypeError) as err: 

1689 msg = ( 

1690 f"'{type(self).__name__}' with dtype {self.dtype} " 

1691 f"does not support reduction '{name}' with pyarrow " 

1692 f"version {pa.__version__}. '{name}' may be supported by " 

1693 f"upgrading pyarrow." 

1694 ) 

1695 raise TypeError(msg) from err 

1696 if name == "median": 

1697 # GH 52679: Use quantile instead of approximate_median; returns array 

1698 result = result[0] 

1699 if pc.is_null(result).as_py(): 

1700 return result 

1701 

1702 if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): 

1703 result = result.cast(pa_type) 

1704 if name in ["median", "mean"] and pa.types.is_temporal(pa_type): 

1705 if not pa_version_under13p0: 

1706 nbits = pa_type.bit_width 

1707 if nbits == 32: 

1708 result = result.cast(pa.int32(), **cast_kwargs) 

1709 else: 

1710 result = result.cast(pa.int64(), **cast_kwargs) 

1711 result = result.cast(pa_type) 

1712 if name in ["std", "sem"] and pa.types.is_temporal(pa_type): 

1713 result = result.cast(pa.int64(), **cast_kwargs) 

1714 if pa.types.is_duration(pa_type): 

1715 result = result.cast(pa_type) 

1716 elif pa.types.is_time(pa_type): 

1717 unit = get_unit_from_pa_dtype(pa_type) 

1718 result = result.cast(pa.duration(unit)) 

1719 elif pa.types.is_date(pa_type): 

1720 # go with closest available unit, i.e. "s" 

1721 result = result.cast(pa.duration("s")) 

1722 else: 

1723 # i.e. timestamp 

1724 result = result.cast(pa.duration(pa_type.unit)) 

1725 

1726 return result 

1727 
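
The duration handling above can be sketched in isolation: pyarrow's sum kernel rejects durations, so the values are summed as int64 and the scalar is cast back (this assumes a pyarrow version where pa.Scalar.cast is available):

import pyarrow as pa
import pyarrow.compute as pc

dur = pa.array([1_000_000, 2_000_000, None], type=pa.duration("us"))
total = pc.sum(dur.cast(pa.int64()), skip_nulls=True)  # sum the int64 view
total = total.cast(pa.duration("us"))                  # -> 3 seconds, as duration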

1728 def _reduce( 

1729 self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs 

1730 ): 

1731 """ 

1732 Return a scalar result of performing the reduction operation. 

1733 

1734 Parameters 

1735 ---------- 

1736 name : str 

1737 Name of the function, supported values are: 

1738 { any, all, min, max, sum, mean, median, prod, 

1739 std, var, sem, kurt, skew }. 

1740 skipna : bool, default True 

1741 If True, skip NaN values. 

1742 **kwargs 

1743 Additional keyword arguments passed to the reduction function. 

1744 Currently, `ddof` is the only supported kwarg. 

1745 

1746 Returns 

1747 ------- 

1748 scalar 

1749 

1750 Raises 

1751 ------ 

1752 TypeError : subclass does not define reductions 

1753 """ 

1754 result = self._reduce_calc(name, skipna=skipna, keepdims=keepdims, **kwargs) 

1755 if isinstance(result, pa.Array): 

1756 return type(self)(result) 

1757 else: 

1758 return result 

1759 

1760 def _reduce_calc( 

1761 self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs 

1762 ): 

1763 pa_result = self._reduce_pyarrow(name, skipna=skipna, **kwargs) 

1764 

1765 if keepdims: 

1766 if isinstance(pa_result, pa.Scalar): 

1767 result = pa.array([pa_result.as_py()], type=pa_result.type) 

1768 else: 

1769 result = pa.array( 

1770 [pa_result], 

1771 type=to_pyarrow_type(infer_dtype_from_scalar(pa_result)[0]), 

1772 ) 

1773 return result 

1774 

1775 if pc.is_null(pa_result).as_py(): 

1776 return self.dtype.na_value 

1777 elif isinstance(pa_result, pa.Scalar): 

1778 return pa_result.as_py() 

1779 else: 

1780 return pa_result 

1781 
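
At the pandas level, this pipeline is what unboxes reductions on pyarrow-backed columns into plain Python scalars; a minimal usage sketch, assuming the standard "int64[pyarrow]" dtype alias:

import pandas as pd

ser = pd.Series([1, 2, None], dtype="int64[pyarrow]")
ser.sum()   # -> 3, unboxed from the pyarrow scalar via .as_py()
ser.mean()  # -> 1.5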

1782 def _explode(self): 

1783 """ 

1784 See Series.explode.__doc__. 

1785 """ 

1786 # child class explode method supports only list types; return 

1787 # default implementation for non-list types. 

1788 if not pa.types.is_list(self.dtype.pyarrow_dtype): 

1789 return super()._explode() 

1790 values = self 

1791 counts = pa.compute.list_value_length(values._pa_array) 

1792 counts = counts.fill_null(1).to_numpy() 

1793 fill_value = pa.scalar([None], type=self._pa_array.type) 

1794 mask = counts == 0 

1795 if mask.any(): 

1796 values = values.copy() 

1797 values[mask] = fill_value 

1798 counts = counts.copy() 

1799 counts[mask] = 1 

1800 values = values.fillna(fill_value) 

1801 values = type(self)(pa.compute.list_flatten(values._pa_array)) 

1802 return values, counts 

1803 
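
The pyarrow building blocks _explode leans on, shown in isolation (a sketch, not the method's exact code path); empty lists flatten to nothing, which is why the method patches them to [None] first:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([pa.array([[1, 2], [], None], type=pa.list_(pa.int64()))])
counts = pc.list_value_length(arr).fill_null(1)  # -> [2, 0, 1]; null rows count as 1
flat = pc.list_flatten(arr)                      # -> [1, 2]; empty and null rows vanish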

1804 def __setitem__(self, key, value) -> None: 

1805 """Set one or more values inplace. 

1806 

1807 Parameters 

1808 ---------- 

1809 key : int, ndarray, or slice 

1810 When called from, e.g. ``Series.__setitem__``, ``key`` will be 

1811 one of 

1812 

1813 * scalar int 

1814 * ndarray of integers. 

1815 * boolean ndarray 

1816 * slice object 

1817 

1818 value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object 

1819 value or values to be set at ``key``. 

1820 

1821 Returns 

1822 ------- 

1823 None 

1824 """ 

1825 # GH50085: unwrap 1D indexers 

1826 if isinstance(key, tuple) and len(key) == 1: 

1827 key = key[0] 

1828 

1829 key = check_array_indexer(self, key) 

1830 value = self._maybe_convert_setitem_value(value) 

1831 

1832 if com.is_null_slice(key): 

1833 # fast path (GH50248) 

1834 data = self._if_else(True, value, self._pa_array) 

1835 

1836 elif is_integer(key): 

1837 # fast path 

1838 key = cast(int, key) 

1839 n = len(self) 

1840 if key < 0: 

1841 key += n 

1842 if not 0 <= key < n: 

1843 raise IndexError( 

1844 f"index {key} is out of bounds for axis 0 with size {n}" 

1845 ) 

1846 if isinstance(value, pa.Scalar): 

1847 value = value.as_py() 

1848 elif is_list_like(value): 

1849 raise ValueError("Length of indexer and values mismatch") 

1850 chunks = [ 

1851 *self._pa_array[:key].chunks, 

1852 pa.array([value], type=self._pa_array.type, from_pandas=True), 

1853 *self._pa_array[key + 1 :].chunks, 

1854 ] 

1855 data = pa.chunked_array(chunks).combine_chunks() 

1856 

1857 elif is_bool_dtype(key): 

1858 key = np.asarray(key, dtype=np.bool_) 

1859 data = self._replace_with_mask(self._pa_array, key, value) 

1860 

1861 elif is_scalar(value) or isinstance(value, pa.Scalar): 

1862 mask = np.zeros(len(self), dtype=np.bool_) 

1863 mask[key] = True 

1864 data = self._if_else(mask, value, self._pa_array) 

1865 

1866 else: 

1867 indices = np.arange(len(self))[key] 

1868 if len(indices) != len(value): 

1869 raise ValueError("Length of indexer and values mismatch") 

1870 if len(indices) == 0: 

1871 return 

1872 argsort = np.argsort(indices) 

1873 indices = indices[argsort] 

1874 value = value.take(argsort) 

1875 mask = np.zeros(len(self), dtype=np.bool_) 

1876 mask[indices] = True 

1877 data = self._replace_with_mask(self._pa_array, mask, value) 

1878 

1879 if isinstance(data, pa.Array): 

1880 data = pa.chunked_array([data]) 

1881 self._pa_array = data 

1882 
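
A small usage sketch of the paths above, assuming the standard "int64[pyarrow]" dtype alias:

import numpy as np
import pandas as pd

arr = pd.array([1, 2, 3], dtype="int64[pyarrow]")
arr[1] = 99                             # integer fast path: chunks rebuilt around index 1
arr[np.array([True, False, True])] = 0  # boolean-mask path via _replace_with_mask
arr[:] = 7                              # null-slice fast path via _if_else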

1883 def _rank_calc( 

1884 self, 

1885 *, 

1886 axis: AxisInt = 0, 

1887 method: str = "average", 

1888 na_option: str = "keep", 

1889 ascending: bool = True, 

1890 pct: bool = False, 

1891 ): 

1892 if axis != 0: 

1893 ranked = super()._rank( 

1894 axis=axis, 

1895 method=method, 

1896 na_option=na_option, 

1897 ascending=ascending, 

1898 pct=pct, 

1899 ) 

1900 # keep dtypes consistent with the implementation below 

1901 if method == "average" or pct: 

1902 pa_type = pa.float64() 

1903 else: 

1904 pa_type = pa.uint64() 

1905 result = pa.array(ranked, type=pa_type, from_pandas=True) 

1906 return result 

1907 

1908 data = self._pa_array.combine_chunks() 

1909 sort_keys = "ascending" if ascending else "descending" 

1910 null_placement = "at_start" if na_option == "top" else "at_end" 

1911 tiebreaker = "min" if method == "average" else method 

1912 

1913 result = pc.rank( 

1914 data, 

1915 sort_keys=sort_keys, 

1916 null_placement=null_placement, 

1917 tiebreaker=tiebreaker, 

1918 ) 

1919 

1920 if na_option == "keep": 

1921 mask = pc.is_null(self._pa_array) 

1922 null = pa.scalar(None, type=result.type) 

1923 result = pc.if_else(mask, null, result) 

1924 

1925 if method == "average": 

1926 result_max = pc.rank( 

1927 data, 

1928 sort_keys=sort_keys, 

1929 null_placement=null_placement, 

1930 tiebreaker="max", 

1931 ) 

1932 result_max = result_max.cast(pa.float64()) 

1933 result_min = result.cast(pa.float64()) 

1934 result = pc.divide(pc.add(result_min, result_max), 2) 

1935 

1936 if pct: 

1937 if not pa.types.is_floating(result.type): 

1938 result = result.cast(pa.float64()) 

1939 if method == "dense": 

1940 divisor = pc.max(result) 

1941 else: 

1942 divisor = pc.count(result) 

1943 result = pc.divide(result, divisor) 

1944 

1945 return result 

1946 
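
pyarrow's rank kernel has no "average" tiebreaker, hence the second pc.rank call above: the averaged rank is the mean of the "min" and "max" ranks. Standalone:

import pyarrow as pa
import pyarrow.compute as pc

data = pa.array([10, 20, 20, 30])
rmin = pc.rank(data, sort_keys="ascending", tiebreaker="min").cast(pa.float64())
rmax = pc.rank(data, sort_keys="ascending", tiebreaker="max").cast(pa.float64())
avg = pc.divide(pc.add(rmin, rmax), 2)  # -> [1.0, 2.5, 2.5, 4.0]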

1947 def _rank( 

1948 self, 

1949 *, 

1950 axis: AxisInt = 0, 

1951 method: str = "average", 

1952 na_option: str = "keep", 

1953 ascending: bool = True, 

1954 pct: bool = False, 

1955 ): 

1956 """ 

1957 See Series.rank.__doc__. 

1958 """ 

1959 return type(self)( 

1960 self._rank_calc( 

1961 axis=axis, 

1962 method=method, 

1963 na_option=na_option, 

1964 ascending=ascending, 

1965 pct=pct, 

1966 ) 

1967 ) 

1968 

1969 def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self: 

1970 """ 

1971 Compute the quantiles of self for each quantile in `qs`. 

1972 

1973 Parameters 

1974 ---------- 

1975 qs : np.ndarray[float64] 

1976 interpolation: str 

1977 

1978 Returns 

1979 ------- 

1980 same type as self 

1981 """ 

1982 pa_dtype = self._pa_array.type 

1983 

1984 data = self._pa_array 

1985 if pa.types.is_temporal(pa_dtype): 

1986 # https://github.com/apache/arrow/issues/33769 in these cases 

1987 # we can cast to ints and back 

1988 nbits = pa_dtype.bit_width 

1989 if nbits == 32: 

1990 data = data.cast(pa.int32()) 

1991 else: 

1992 data = data.cast(pa.int64()) 

1993 

1994 result = pc.quantile(data, q=qs, interpolation=interpolation) 

1995 

1996 if pa.types.is_temporal(pa_dtype): 

1997 if pa.types.is_floating(result.type): 

1998 result = pc.floor(result) 

1999 nbits = pa_dtype.bit_width 

2000 if nbits == 32: 

2001 result = result.cast(pa.int32()) 

2002 else: 

2003 result = result.cast(pa.int64()) 

2004 result = result.cast(pa_dtype) 

2005 

2006 return type(self)(result) 

2007 
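
The same int-view workaround, in isolation: pc.quantile has no temporal support (see the linked arrow issue), so values are cast to integers, and the interpolated float result is floored so it can be cast back losslessly:

import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array([0, 10, 20], type=pa.timestamp("s"))
q = pc.quantile(ts.cast(pa.int64()), q=0.5, interpolation="linear")
q = pc.floor(q).cast(pa.int64()).cast(pa.timestamp("s"))  # median as a timestamp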

2008 def _mode(self, dropna: bool = True) -> Self: 

2009 """ 

2010 Returns the mode(s) of the ExtensionArray. 

2011 

2012 Always returns `ExtensionArray` even if only one value. 

2013 

2014 Parameters 

2015 ---------- 

2016 dropna : bool, default True 

2017 Don't consider counts of NA values. 

2018 

2019 Returns 

2020 ------- 

2021 same type as self 

2022 Sorted, if possible. 

2023 """ 

2024 pa_type = self._pa_array.type 

2025 if pa.types.is_temporal(pa_type): 

2026 nbits = pa_type.bit_width 

2027 if nbits == 32: 

2028 data = self._pa_array.cast(pa.int32()) 

2029 elif nbits == 64: 

2030 data = self._pa_array.cast(pa.int64()) 

2031 else: 

2032 raise NotImplementedError(pa_type) 

2033 else: 

2034 data = self._pa_array 

2035 

2036 if dropna: 

2037 data = data.drop_null() 

2038 

2039 res = pc.value_counts(data) 

2040 most_common = res.field("values").filter( 

2041 pc.equal(res.field("counts"), pc.max(res.field("counts"))) 

2042 ) 

2043 

2044 if pa.types.is_temporal(pa_type): 

2045 most_common = most_common.cast(pa_type) 

2046 

2047 most_common = most_common.take(pc.array_sort_indices(most_common)) 

2048 return type(self)(most_common) 

2049 
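
The value_counts/filter idiom above, standalone:

import pyarrow as pa
import pyarrow.compute as pc

data = pa.array([1, 2, 2, 3, 3])
vc = pc.value_counts(data)  # struct array with "values" and "counts" fields
modes = vc.field("values").filter(
    pc.equal(vc.field("counts"), pc.max(vc.field("counts")))
)                           # -> [2, 3], both values tied at count 2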

2050 def _maybe_convert_setitem_value(self, value): 

2051 """Maybe convert value to be pyarrow compatible.""" 

2052 try: 

2053 value = self._box_pa(value, self._pa_array.type) 

2054 except pa.ArrowTypeError as err: 

2055 msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" 

2056 raise TypeError(msg) from err 

2057 return value 

2058 

2059 def interpolate( 

2060 self, 

2061 *, 

2062 method: InterpolateOptions, 

2063 axis: int, 

2064 index, 

2065 limit, 

2066 limit_direction, 

2067 limit_area, 

2068 copy: bool, 

2069 **kwargs, 

2070 ) -> Self: 

2071 """ 

2072 See NDFrame.interpolate.__doc__. 

2073 """ 

2074 # NB: we return type(self) even if copy=False 

2075 mask = self.isna() 

2076 if self.dtype.kind == "f": 

2077 data = self._pa_array.to_numpy() 

2078 elif self.dtype.kind in "iu": 

2079 data = self.to_numpy(dtype="f8", na_value=0.0) 

2080 else: 

2081 raise NotImplementedError( 

2082 f"interpolate is not implemented for dtype={self.dtype}" 

2083 ) 

2084 

2085 missing.interpolate_2d_inplace( 

2086 data, 

2087 method=method, 

2088 axis=0, 

2089 index=index, 

2090 limit=limit, 

2091 limit_direction=limit_direction, 

2092 limit_area=limit_area, 

2093 mask=mask, 

2094 **kwargs, 

2095 ) 

2096 return type(self)(self._box_pa_array(pa.array(data, mask=mask))) 

2097 
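
A usage sketch (interpolation of pyarrow-backed floats round-trips through the numpy buffer above); this assumes a pandas version where Series.interpolate supports ArrowDtype:

import pandas as pd

ser = pd.Series([1.0, None, 3.0], dtype="float64[pyarrow]")
ser.interpolate(method="linear")  # -> [1.0, 2.0, 3.0], still float64[pyarrow]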

2098 @classmethod 

2099 def _if_else( 

2100 cls, 

2101 cond: npt.NDArray[np.bool_] | bool, 

2102 left: ArrayLike | Scalar, 

2103 right: ArrayLike | Scalar, 

2104 ): 

2105 """ 

2106 Choose values based on a condition. 

2107 

2108 Analogous to pyarrow.compute.if_else, with logic 

2109 to fall back to numpy for unsupported types. 

2110 

2111 Parameters 

2112 ---------- 

2113 cond : npt.NDArray[np.bool_] or bool 

2114 left : ArrayLike | Scalar 

2115 right : ArrayLike | Scalar 

2116 

2117 Returns 

2118 ------- 

2119 pa.Array 

2120 """ 

2121 try: 

2122 return pc.if_else(cond, left, right) 

2123 except pa.ArrowNotImplementedError: 

2124 pass 

2125 

2126 def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: 

2127 if isinstance(value, (pa.Array, pa.ChunkedArray)): 

2128 pa_type = value.type 

2129 elif isinstance(value, pa.Scalar): 

2130 pa_type = value.type 

2131 value = value.as_py() 

2132 else: 

2133 pa_type = None 

2134 return np.array(value, dtype=object), pa_type 

2135 

2136 left, left_type = _to_numpy_and_type(left) 

2137 right, right_type = _to_numpy_and_type(right) 

2138 pa_type = left_type or right_type 

2139 result = np.where(cond, left, right) 

2140 return pa.array(result, type=pa_type, from_pandas=True) 

2141 

2142 @classmethod 

2143 def _replace_with_mask( 

2144 cls, 

2145 values: pa.Array | pa.ChunkedArray, 

2146 mask: npt.NDArray[np.bool_] | bool, 

2147 replacements: ArrayLike | Scalar, 

2148 ): 

2149 """ 

2150 Replace items selected with a mask. 

2151 

2152 Analogous to pyarrow.compute.replace_with_mask, with logic 

2153 to fall back to numpy for unsupported types. 

2154 

2155 Parameters 

2156 ---------- 

2157 values : pa.Array or pa.ChunkedArray 

2158 mask : npt.NDArray[np.bool_] or bool 

2159 replacements : ArrayLike or Scalar 

2160 Replacement value(s) 

2161 

2162 Returns 

2163 ------- 

2164 pa.Array or pa.ChunkedArray 

2165 """ 

2166 if isinstance(replacements, pa.ChunkedArray): 

2167 # replacements must be array or scalar, not ChunkedArray 

2168 replacements = replacements.combine_chunks() 

2169 if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): 

2170 # GH#52059 replace_with_mask segfaults for chunked array 

2171 # https://github.com/apache/arrow/issues/34634 

2172 values = values.combine_chunks() 

2173 try: 

2174 return pc.replace_with_mask(values, mask, replacements) 

2175 except pa.ArrowNotImplementedError: 

2176 pass 

2177 if isinstance(replacements, pa.Array): 

2178 replacements = np.array(replacements, dtype=object) 

2179 elif isinstance(replacements, pa.Scalar): 

2180 replacements = replacements.as_py() 

2181 result = np.array(values, dtype=object) 

2182 result[mask] = replacements 

2183 return pa.array(result, type=values.type, from_pandas=True) 

2184 
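
The numpy fallback at the end, in isolation: an object-dtype round trip for types pc.replace_with_mask cannot handle, with from_pandas=True turning None back into nulls:

import numpy as np
import pyarrow as pa

values = pa.array(["a", "b", None])
mask = np.array([True, False, False])
out = np.array(values, dtype=object)  # -> array(['a', 'b', None], dtype=object)
out[mask] = "x"
pa.array(out, type=values.type, from_pandas=True)  # -> ["x", "b", null]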

2185 # ------------------------------------------------------------------ 

2186 # GroupBy Methods 

2187 

2188 def _to_masked(self): 

2189 pa_dtype = self._pa_array.type 

2190 

2191 if pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype): 

2192 na_value = 1 

2193 elif pa.types.is_boolean(pa_dtype): 

2194 na_value = True 

2195 else: 

2196 raise NotImplementedError 

2197 

2198 dtype = _arrow_dtype_mapping()[pa_dtype] 

2199 mask = self.isna() 

2200 arr = self.to_numpy(dtype=dtype.numpy_dtype, na_value=na_value) 

2201 return dtype.construct_array_type()(arr, mask) 

2202 

2203 def _groupby_op( 

2204 self, 

2205 *, 

2206 how: str, 

2207 has_dropped_na: bool, 

2208 min_count: int, 

2209 ngroups: int, 

2210 ids: npt.NDArray[np.intp], 

2211 **kwargs, 

2212 ): 

2213 if isinstance(self.dtype, StringDtype): 

2214 return super()._groupby_op( 

2215 how=how, 

2216 has_dropped_na=has_dropped_na, 

2217 min_count=min_count, 

2218 ngroups=ngroups, 

2219 ids=ids, 

2220 **kwargs, 

2221 ) 

2222 

2223 # maybe convert to a compatible dtype optimized for groupby 

2224 values: ExtensionArray 

2225 pa_type = self._pa_array.type 

2226 if pa.types.is_timestamp(pa_type): 

2227 values = self._to_datetimearray() 

2228 elif pa.types.is_duration(pa_type): 

2229 values = self._to_timedeltaarray() 

2230 else: 

2231 values = self._to_masked() 

2232 

2233 result = values._groupby_op( 

2234 how=how, 

2235 has_dropped_na=has_dropped_na, 

2236 min_count=min_count, 

2237 ngroups=ngroups, 

2238 ids=ids, 

2239 **kwargs, 

2240 ) 

2241 if isinstance(result, np.ndarray): 

2242 return result 

2243 return type(self)._from_sequence(result, copy=False) 

2244 

2245 def _apply_elementwise(self, func: Callable) -> list[list[Any]]: 

2246 """Apply a callable to each element while maintaining the chunking structure.""" 

2247 return [ 

2248 [ 

2249 None if val is None else func(val) 

2250 for val in chunk.to_numpy(zero_copy_only=False) 

2251 ] 

2252 for chunk in self._pa_array.iterchunks() 

2253 ] 

2254 
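
_apply_elementwise in action; the nested-list result mirrors the input chunking, which is why callers can rebuild a chunked_array directly:

import pyarrow as pa

pa_arr = pa.chunked_array([["ab", None], ["cde"]])
out = [
    [None if v is None else len(v) for v in chunk.to_numpy(zero_copy_only=False)]
    for chunk in pa_arr.iterchunks()
]                      # -> [[2, None], [3]]
pa.chunked_array(out)  # two chunks, like the input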

2255 def _str_count(self, pat: str, flags: int = 0): 

2256 if flags: 

2257 raise NotImplementedError(f"count not implemented with {flags=}") 

2258 return type(self)(pc.count_substring_regex(self._pa_array, pat)) 

2259 

2260 def _str_contains( 

2261 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True 

2262 ): 

2263 if flags: 

2264 raise NotImplementedError(f"contains not implemented with {flags=}") 

2265 

2266 if regex: 

2267 pa_contains = pc.match_substring_regex 

2268 else: 

2269 pa_contains = pc.match_substring 

2270 result = pa_contains(self._pa_array, pat, ignore_case=not case) 

2271 if not isna(na): 

2272 result = result.fill_null(na) 

2273 return type(self)(result) 

2274 

2275 def _str_startswith(self, pat: str | tuple[str, ...], na=None): 

2276 if isinstance(pat, str): 

2277 result = pc.starts_with(self._pa_array, pattern=pat) 

2278 else: 

2279 if len(pat) == 0: 

2280 # For empty tuple, pd.StringDtype() returns null for missing values 

2281 # and false for valid values. 

2282 result = pc.if_else(pc.is_null(self._pa_array), None, False) 

2283 else: 

2284 result = pc.starts_with(self._pa_array, pattern=pat[0]) 

2285 

2286 for p in pat[1:]: 

2287 result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) 

2288 if not isna(na): 

2289 result = result.fill_null(na) 

2290 return type(self)(result) 

2291 

2292 def _str_endswith(self, pat: str | tuple[str, ...], na=None): 

2293 if isinstance(pat, str): 

2294 result = pc.ends_with(self._pa_array, pattern=pat) 

2295 else: 

2296 if len(pat) == 0: 

2297 # For empty tuple, pd.StringDtype() returns null for missing values 

2298 # and false for valid values. 

2299 result = pc.if_else(pc.is_null(self._pa_array), None, False) 

2300 else: 

2301 result = pc.ends_with(self._pa_array, pattern=pat[0]) 

2302 

2303 for p in pat[1:]: 

2304 result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) 

2305 if not isna(na): 

2306 result = result.fill_null(na) 

2307 return type(self)(result) 

2308 

2309 def _str_replace( 

2310 self, 

2311 pat: str | re.Pattern, 

2312 repl: str | Callable, 

2313 n: int = -1, 

2314 case: bool = True, 

2315 flags: int = 0, 

2316 regex: bool = True, 

2317 ): 

2318 if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: 

2319 raise NotImplementedError( 

2320 "replace is not supported with a re.Pattern, callable repl, " 

2321 "case=False, or flags!=0" 

2322 ) 

2323 

2324 func = pc.replace_substring_regex if regex else pc.replace_substring 

2325 # https://github.com/apache/arrow/issues/39149 

2326 # GH 56404, unexpected behavior with negative max_replacements with pyarrow. 

2327 pa_max_replacements = None if n < 0 else n 

2328 result = func( 

2329 self._pa_array, 

2330 pattern=pat, 

2331 replacement=repl, 

2332 max_replacements=pa_max_replacements, 

2333 ) 

2334 return type(self)(result) 

2335 

2336 def _str_repeat(self, repeats: int | Sequence[int]): 

2337 if not isinstance(repeats, int): 

2338 raise NotImplementedError( 

2339 f"repeat is not implemented when repeats is {type(repeats).__name__}" 

2340 ) 

2341 else: 

2342 return type(self)(pc.binary_repeat(self._pa_array, repeats)) 

2343 

2344 def _str_match( 

2345 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None 

2346 ): 

2347 if not pat.startswith("^"): 

2348 pat = f"^{pat}" 

2349 return self._str_contains(pat, case, flags, na, regex=True) 

2350 

2351 def _str_fullmatch( 

2352 self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None 

2353 ): 

2354 if not pat.endswith("$") or pat.endswith("\\$"): 

2355 pat = f"{pat}$" 

2356 return self._str_match(pat, case, flags, na) 

2357 

2358 def _str_find(self, sub: str, start: int = 0, end: int | None = None): 

2359 if start != 0 and end is not None: 

2360 slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end) 

2361 result = pc.find_substring(slices, sub) 

2362 not_found = pc.equal(result, -1) 

2363 start_offset = max(0, start) 

2364 offset_result = pc.add(result, start_offset) 

2365 result = pc.if_else(not_found, result, offset_result) 

2366 elif start == 0 and end is None: 

2367 slices = self._pa_array 

2368 result = pc.find_substring(slices, sub) 

2369 else: 

2370 raise NotImplementedError( 

2371 f"find not implemented with {sub=}, {start=}, {end=}" 

2372 ) 

2373 return type(self)(result) 

2374 

2375 def _str_join(self, sep: str): 

2376 if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string( 

2377 self._pa_array.type 

2378 ): 

2379 result = self._apply_elementwise(list) 

2380 result = pa.chunked_array(result, type=pa.list_(pa.string())) 

2381 else: 

2382 result = self._pa_array 

2383 return type(self)(pc.binary_join(result, sep)) 

2384 

2385 def _str_partition(self, sep: str, expand: bool): 

2386 predicate = lambda val: val.partition(sep) 

2387 result = self._apply_elementwise(predicate) 

2388 return type(self)(pa.chunked_array(result)) 

2389 

2390 def _str_rpartition(self, sep: str, expand: bool): 

2391 predicate = lambda val: val.rpartition(sep) 

2392 result = self._apply_elementwise(predicate) 

2393 return type(self)(pa.chunked_array(result)) 

2394 

2395 def _str_slice( 

2396 self, start: int | None = None, stop: int | None = None, step: int | None = None 

2397 ): 

2398 if start is None: 

2399 start = 0 

2400 if step is None: 

2401 step = 1 

2402 return type(self)( 

2403 pc.utf8_slice_codeunits(self._pa_array, start=start, stop=stop, step=step) 

2404 ) 

2405 

2406 def _str_isalnum(self): 

2407 return type(self)(pc.utf8_is_alnum(self._pa_array)) 

2408 

2409 def _str_isalpha(self): 

2410 return type(self)(pc.utf8_is_alpha(self._pa_array)) 

2411 

2412 def _str_isdecimal(self): 

2413 return type(self)(pc.utf8_is_decimal(self._pa_array)) 

2414 

2415 def _str_isdigit(self): 

2416 return type(self)(pc.utf8_is_digit(self._pa_array)) 

2417 

2418 def _str_islower(self): 

2419 return type(self)(pc.utf8_is_lower(self._pa_array)) 

2420 

2421 def _str_isnumeric(self): 

2422 return type(self)(pc.utf8_is_numeric(self._pa_array)) 

2423 

2424 def _str_isspace(self): 

2425 return type(self)(pc.utf8_is_space(self._pa_array)) 

2426 

2427 def _str_istitle(self): 

2428 return type(self)(pc.utf8_is_title(self._pa_array)) 

2429 

2430 def _str_isupper(self): 

2431 return type(self)(pc.utf8_is_upper(self._pa_array)) 

2432 

2433 def _str_len(self): 

2434 return type(self)(pc.utf8_length(self._pa_array)) 

2435 

2436 def _str_lower(self): 

2437 return type(self)(pc.utf8_lower(self._pa_array)) 

2438 

2439 def _str_upper(self): 

2440 return type(self)(pc.utf8_upper(self._pa_array)) 

2441 

2442 def _str_strip(self, to_strip=None): 

2443 if to_strip is None: 

2444 result = pc.utf8_trim_whitespace(self._pa_array) 

2445 else: 

2446 result = pc.utf8_trim(self._pa_array, characters=to_strip) 

2447 return type(self)(result) 

2448 

2449 def _str_lstrip(self, to_strip=None): 

2450 if to_strip is None: 

2451 result = pc.utf8_ltrim_whitespace(self._pa_array) 

2452 else: 

2453 result = pc.utf8_ltrim(self._pa_array, characters=to_strip) 

2454 return type(self)(result) 

2455 

2456 def _str_rstrip(self, to_strip=None): 

2457 if to_strip is None: 

2458 result = pc.utf8_rtrim_whitespace(self._pa_array) 

2459 else: 

2460 result = pc.utf8_rtrim(self._pa_array, characters=to_strip) 

2461 return type(self)(result) 

2462 

2463 def _str_removeprefix(self, prefix: str): 

2464 if not pa_version_under13p0: 

2465 starts_with = pc.starts_with(self._pa_array, pattern=prefix) 

2466 removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix)) 

2467 result = pc.if_else(starts_with, removed, self._pa_array) 

2468 return type(self)(result) 

2469 predicate = lambda val: val.removeprefix(prefix) 

2470 result = self._apply_elementwise(predicate) 

2471 return type(self)(pa.chunked_array(result)) 

2472 

2473 def _str_casefold(self): 

2474 predicate = lambda val: val.casefold() 

2475 result = self._apply_elementwise(predicate) 

2476 return type(self)(pa.chunked_array(result)) 

2477 

2478 def _str_encode(self, encoding: str, errors: str = "strict"): 

2479 predicate = lambda val: val.encode(encoding, errors) 

2480 result = self._apply_elementwise(predicate) 

2481 return type(self)(pa.chunked_array(result)) 

2482 

2483 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): 

2484 if flags: 

2485 raise NotImplementedError("Only flags=0 is implemented.") 

2486 groups = re.compile(pat).groupindex.keys() 

2487 if len(groups) == 0: 

2488 raise ValueError(f"{pat=} must contain a symbolic group name.") 

2489 result = pc.extract_regex(self._pa_array, pat) 

2490 if expand: 

2491 return { 

2492 col: type(self)(pc.struct_field(result, [i])) 

2493 for col, i in zip(groups, range(result.type.num_fields)) 

2494 } 

2495 else: 

2496 return type(self)(pc.struct_field(result, [0])) 

2497 

2498 def _str_findall(self, pat: str, flags: int = 0): 

2499 regex = re.compile(pat, flags=flags) 

2500 predicate = lambda val: regex.findall(val) 

2501 result = self._apply_elementwise(predicate) 

2502 return type(self)(pa.chunked_array(result)) 

2503 

2504 def _str_get_dummies(self, sep: str = "|"): 

2505 split = pc.split_pattern(self._pa_array, sep) 

2506 flattened_values = pc.list_flatten(split) 

2507 uniques = flattened_values.unique() 

2508 uniques_sorted = uniques.take(pa.compute.array_sort_indices(uniques)) 

2509 lengths = pc.list_value_length(split).fill_null(0).to_numpy() 

2510 n_rows = len(self) 

2511 n_cols = len(uniques) 

2512 indices = pc.index_in(flattened_values, uniques_sorted).to_numpy() 

2513 indices = indices + np.arange(n_rows).repeat(lengths) * n_cols 

2514 dummies = np.zeros(n_rows * n_cols, dtype=np.bool_) 

2515 dummies[indices] = True 

2516 dummies = dummies.reshape((n_rows, n_cols)) 

2517 result = type(self)(pa.array(list(dummies))) 

2518 return result, uniques_sorted.to_pylist() 

2519 
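
The index arithmetic in _str_get_dummies, reduced to its core: a cell "value j present in row i" lands at position j + i * n_cols of the flattened row-major matrix:

import numpy as np

n_rows, n_cols = 3, 2
row = np.array([0, 0, 1])      # row of each flattened split value
col = np.array([0, 1, 1])      # its index among the sorted uniques
flat_idx = col + row * n_cols  # -> [0, 1, 3]
dummies = np.zeros(n_rows * n_cols, dtype=np.bool_)
dummies[flat_idx] = True
dummies.reshape(n_rows, n_cols)  # cells (0, 0), (0, 1) and (1, 1) are True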

2520 def _str_index(self, sub: str, start: int = 0, end: int | None = None): 

2521 predicate = lambda val: val.index(sub, start, end) 

2522 result = self._apply_elementwise(predicate) 

2523 return type(self)(pa.chunked_array(result)) 

2524 

2525 def _str_rindex(self, sub: str, start: int = 0, end: int | None = None): 

2526 predicate = lambda val: val.rindex(sub, start, end) 

2527 result = self._apply_elementwise(predicate) 

2528 return type(self)(pa.chunked_array(result)) 

2529 

2530 def _str_normalize(self, form: str): 

2531 predicate = lambda val: unicodedata.normalize(form, val) 

2532 result = self._apply_elementwise(predicate) 

2533 return type(self)(pa.chunked_array(result)) 

2534 

2535 def _str_rfind(self, sub: str, start: int = 0, end=None): 

2536 predicate = lambda val: val.rfind(sub, start, end) 

2537 result = self._apply_elementwise(predicate) 

2538 return type(self)(pa.chunked_array(result)) 

2539 

2540 def _str_split( 

2541 self, 

2542 pat: str | None = None, 

2543 n: int | None = -1, 

2544 expand: bool = False, 

2545 regex: bool | None = None, 

2546 ): 

2547 if n in {-1, 0}: 

2548 n = None 

2549 if pat is None: 

2550 split_func = pc.utf8_split_whitespace 

2551 elif regex: 

2552 split_func = functools.partial(pc.split_pattern_regex, pattern=pat) 

2553 else: 

2554 split_func = functools.partial(pc.split_pattern, pattern=pat) 

2555 return type(self)(split_func(self._pa_array, max_splits=n)) 

2556 

2557 def _str_rsplit(self, pat: str | None = None, n: int | None = -1): 

2558 if n in {-1, 0}: 

2559 n = None 

2560 if pat is None: 

2561 return type(self)( 

2562 pc.utf8_split_whitespace(self._pa_array, max_splits=n, reverse=True) 

2563 ) 

2564 else: 

2565 return type(self)( 

2566 pc.split_pattern(self._pa_array, pat, max_splits=n, reverse=True) 

2567 ) 

2568 

2569 def _str_translate(self, table: dict[int, str]): 

2570 predicate = lambda val: val.translate(table) 

2571 result = self._apply_elementwise(predicate) 

2572 return type(self)(pa.chunked_array(result)) 

2573 

2574 def _str_wrap(self, width: int, **kwargs): 

2575 kwargs["width"] = width 

2576 tw = textwrap.TextWrapper(**kwargs) 

2577 predicate = lambda val: "\n".join(tw.wrap(val)) 

2578 result = self._apply_elementwise(predicate) 

2579 return type(self)(pa.chunked_array(result)) 

2580 

2581 @property 

2582 def _dt_days(self): 

2583 return type(self)( 

2584 pa.array(self._to_timedeltaarray().days, from_pandas=True, type=pa.int32()) 

2585 ) 

2586 

2587 @property 

2588 def _dt_hours(self): 

2589 return type(self)( 

2590 pa.array( 

2591 [ 

2592 td.components.hours if td is not NaT else None 

2593 for td in self._to_timedeltaarray() 

2594 ], 

2595 type=pa.int32(), 

2596 ) 

2597 ) 

2598 

2599 @property 

2600 def _dt_minutes(self): 

2601 return type(self)( 

2602 pa.array( 

2603 [ 

2604 td.components.minutes if td is not NaT else None 

2605 for td in self._to_timedeltaarray() 

2606 ], 

2607 type=pa.int32(), 

2608 ) 

2609 ) 

2610 

2611 @property 

2612 def _dt_seconds(self): 

2613 return type(self)( 

2614 pa.array( 

2615 self._to_timedeltaarray().seconds, from_pandas=True, type=pa.int32() 

2616 ) 

2617 ) 

2618 

2619 @property 

2620 def _dt_milliseconds(self): 

2621 return type(self)( 

2622 pa.array( 

2623 [ 

2624 td.components.milliseconds if td is not NaT else None 

2625 for td in self._to_timedeltaarray() 

2626 ], 

2627 type=pa.int32(), 

2628 ) 

2629 ) 

2630 

2631 @property 

2632 def _dt_microseconds(self): 

2633 return type(self)( 

2634 pa.array( 

2635 self._to_timedeltaarray().microseconds, 

2636 from_pandas=True, 

2637 type=pa.int32(), 

2638 ) 

2639 ) 

2640 

2641 @property 

2642 def _dt_nanoseconds(self): 

2643 return type(self)( 

2644 pa.array( 

2645 self._to_timedeltaarray().nanoseconds, from_pandas=True, type=pa.int32() 

2646 ) 

2647 ) 

2648 

2649 def _dt_to_pytimedelta(self): 

2650 data = self._pa_array.to_pylist() 

2651 if self._dtype.pyarrow_dtype.unit == "ns": 

2652 data = [None if ts is None else ts.to_pytimedelta() for ts in data] 

2653 return np.array(data, dtype=object) 

2654 

2655 def _dt_total_seconds(self): 

2656 return type(self)( 

2657 pa.array(self._to_timedeltaarray().total_seconds(), from_pandas=True) 

2658 ) 

2659 

2660 def _dt_as_unit(self, unit: str): 

2661 if pa.types.is_date(self.dtype.pyarrow_dtype): 

2662 raise NotImplementedError("as_unit not implemented for date types") 

2663 pd_array = self._maybe_convert_datelike_array() 

2664 # Don't just cast _pa_array in order to follow pandas unit conversion rules 

2665 return type(self)(pa.array(pd_array.as_unit(unit), from_pandas=True)) 

2666 

2667 @property 

2668 def _dt_year(self): 

2669 return type(self)(pc.year(self._pa_array)) 

2670 

2671 @property 

2672 def _dt_day(self): 

2673 return type(self)(pc.day(self._pa_array)) 

2674 

2675 @property 

2676 def _dt_day_of_week(self): 

2677 return type(self)(pc.day_of_week(self._pa_array)) 

2678 

2679 _dt_dayofweek = _dt_day_of_week 

2680 _dt_weekday = _dt_day_of_week 

2681 

2682 @property 

2683 def _dt_day_of_year(self): 

2684 return type(self)(pc.day_of_year(self._pa_array)) 

2685 

2686 _dt_dayofyear = _dt_day_of_year 

2687 

2688 @property 

2689 def _dt_hour(self): 

2690 return type(self)(pc.hour(self._pa_array)) 

2691 

2692 def _dt_isocalendar(self): 

2693 return type(self)(pc.iso_calendar(self._pa_array)) 

2694 

2695 @property 

2696 def _dt_is_leap_year(self): 

2697 return type(self)(pc.is_leap_year(self._pa_array)) 

2698 

2699 @property 

2700 def _dt_is_month_start(self): 

2701 return type(self)(pc.equal(pc.day(self._pa_array), 1)) 

2702 

2703 @property 

2704 def _dt_is_month_end(self): 

2705 result = pc.equal( 

2706 pc.days_between( 

2707 pc.floor_temporal(self._pa_array, unit="day"), 

2708 pc.ceil_temporal(self._pa_array, unit="month"), 

2709 ), 

2710 1, 

2711 ) 

2712 return type(self)(result) 

2713 

2714 @property 

2715 def _dt_is_year_start(self): 

2716 return type(self)( 

2717 pc.and_( 

2718 pc.equal(pc.month(self._pa_array), 1), 

2719 pc.equal(pc.day(self._pa_array), 1), 

2720 ) 

2721 ) 

2722 

2723 @property 

2724 def _dt_is_year_end(self): 

2725 return type(self)( 

2726 pc.and_( 

2727 pc.equal(pc.month(self._pa_array), 12), 

2728 pc.equal(pc.day(self._pa_array), 31), 

2729 ) 

2730 ) 

2731 

2732 @property 

2733 def _dt_is_quarter_start(self): 

2734 result = pc.equal( 

2735 pc.floor_temporal(self._pa_array, unit="quarter"), 

2736 pc.floor_temporal(self._pa_array, unit="day"), 

2737 ) 

2738 return type(self)(result) 

2739 

2740 @property 

2741 def _dt_is_quarter_end(self): 

2742 result = pc.equal( 

2743 pc.days_between( 

2744 pc.floor_temporal(self._pa_array, unit="day"), 

2745 pc.ceil_temporal(self._pa_array, unit="quarter"), 

2746 ), 

2747 1, 

2748 ) 

2749 return type(self)(result) 

2750 

2751 @property 

2752 def _dt_days_in_month(self): 

2753 result = pc.days_between( 

2754 pc.floor_temporal(self._pa_array, unit="month"), 

2755 pc.ceil_temporal(self._pa_array, unit="month"), 

2756 ) 

2757 return type(self)(result) 

2758 

2759 _dt_daysinmonth = _dt_days_in_month 

2760 

2761 @property 

2762 def _dt_microsecond(self): 

2763 return type(self)(pc.microsecond(self._pa_array)) 

2764 

2765 @property 

2766 def _dt_minute(self): 

2767 return type(self)(pc.minute(self._pa_array)) 

2768 

2769 @property 

2770 def _dt_month(self): 

2771 return type(self)(pc.month(self._pa_array)) 

2772 

2773 @property 

2774 def _dt_nanosecond(self): 

2775 return type(self)(pc.nanosecond(self._pa_array)) 

2776 

2777 @property 

2778 def _dt_quarter(self): 

2779 return type(self)(pc.quarter(self._pa_array)) 

2780 

2781 @property 

2782 def _dt_second(self): 

2783 return type(self)(pc.second(self._pa_array)) 

2784 

2785 @property 

2786 def _dt_date(self): 

2787 return type(self)(self._pa_array.cast(pa.date32())) 

2788 

2789 @property 

2790 def _dt_time(self): 

2791 unit = ( 

2792 self.dtype.pyarrow_dtype.unit 

2793 if self.dtype.pyarrow_dtype.unit in {"us", "ns"} 

2794 else "ns" 

2795 ) 

2796 return type(self)(self._pa_array.cast(pa.time64(unit))) 

2797 

2798 @property 

2799 def _dt_tz(self): 

2800 return timezones.maybe_get_tz(self.dtype.pyarrow_dtype.tz) 

2801 

2802 @property 

2803 def _dt_unit(self): 

2804 return self.dtype.pyarrow_dtype.unit 

2805 

2806 def _dt_normalize(self): 

2807 return type(self)(pc.floor_temporal(self._pa_array, 1, "day")) 

2808 

2809 def _dt_strftime(self, format: str): 

2810 return type(self)(pc.strftime(self._pa_array, format=format)) 

2811 

2812 def _round_temporally( 

2813 self, 

2814 method: Literal["ceil", "floor", "round"], 

2815 freq, 

2816 ambiguous: TimeAmbiguous = "raise", 

2817 nonexistent: TimeNonexistent = "raise", 

2818 ): 

2819 if ambiguous != "raise": 

2820 raise NotImplementedError("ambiguous is not supported.") 

2821 if nonexistent != "raise": 

2822 raise NotImplementedError("nonexistent is not supported.") 

2823 offset = to_offset(freq) 

2824 if offset is None: 

2825 raise ValueError(f"Must specify a valid frequency: {freq}") 

2826 pa_supported_unit = { 

2827 "Y": "year", 

2828 "YS": "year", 

2829 "Q": "quarter", 

2830 "QS": "quarter", 

2831 "M": "month", 

2832 "MS": "month", 

2833 "W": "week", 

2834 "D": "day", 

2835 "h": "hour", 

2836 "min": "minute", 

2837 "s": "second", 

2838 "ms": "millisecond", 

2839 "us": "microsecond", 

2840 "ns": "nanosecond", 

2841 } 

2842 unit = pa_supported_unit.get(offset._prefix, None) 

2843 if unit is None: 

2844 raise ValueError(f"{freq=} is not supported") 

2845 multiple = offset.n 

2846 rounding_method = getattr(pc, f"{method}_temporal") 

2847 return type(self)(rounding_method(self._pa_array, multiple=multiple, unit=unit)) 

2848 
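
The table above maps a pandas offset such as "15min" to (unit="minute", multiple=15) for pyarrow's *_temporal kernels; standalone:

import pyarrow as pa
import pyarrow.compute as pc

ts = pa.array(["2024-01-01 10:37:00"], type=pa.timestamp("s"))
pc.floor_temporal(ts, multiple=15, unit="minute")  # -> 2024-01-01 10:30:00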

2849 def _dt_ceil( 

2850 self, 

2851 freq, 

2852 ambiguous: TimeAmbiguous = "raise", 

2853 nonexistent: TimeNonexistent = "raise", 

2854 ): 

2855 return self._round_temporally("ceil", freq, ambiguous, nonexistent) 

2856 

2857 def _dt_floor( 

2858 self, 

2859 freq, 

2860 ambiguous: TimeAmbiguous = "raise", 

2861 nonexistent: TimeNonexistent = "raise", 

2862 ): 

2863 return self._round_temporally("floor", freq, ambiguous, nonexistent) 

2864 

2865 def _dt_round( 

2866 self, 

2867 freq, 

2868 ambiguous: TimeAmbiguous = "raise", 

2869 nonexistent: TimeNonexistent = "raise", 

2870 ): 

2871 return self._round_temporally("round", freq, ambiguous, nonexistent) 

2872 

2873 def _dt_day_name(self, locale: str | None = None): 

2874 if locale is None: 

2875 locale = "C" 

2876 return type(self)(pc.strftime(self._pa_array, format="%A", locale=locale)) 

2877 

2878 def _dt_month_name(self, locale: str | None = None): 

2879 if locale is None: 

2880 locale = "C" 

2881 return type(self)(pc.strftime(self._pa_array, format="%B", locale=locale)) 

2882 

2883 def _dt_to_pydatetime(self): 

2884 if pa.types.is_date(self.dtype.pyarrow_dtype): 

2885 raise ValueError( 

2886 f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. " 

2887 "Convert to pyarrow timestamp type." 

2888 ) 

2889 data = self._pa_array.to_pylist() 

2890 if self._dtype.pyarrow_dtype.unit == "ns": 

2891 data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data] 

2892 return np.array(data, dtype=object) 

2893 

2894 def _dt_tz_localize( 

2895 self, 

2896 tz, 

2897 ambiguous: TimeAmbiguous = "raise", 

2898 nonexistent: TimeNonexistent = "raise", 

2899 ): 

2900 if ambiguous != "raise": 

2901 raise NotImplementedError(f"{ambiguous=} is not supported") 

2902 nonexistent_pa = { 

2903 "raise": "raise", 

2904 "shift_backward": "earliest", 

2905 "shift_forward": "latest", 

2906 }.get( 

2907 nonexistent, None # type: ignore[arg-type] 

2908 ) 

2909 if nonexistent_pa is None: 

2910 raise NotImplementedError(f"{nonexistent=} is not supported") 

2911 if tz is None: 

2912 result = self._pa_array.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit)) 

2913 else: 

2914 result = pc.assume_timezone( 

2915 self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa 

2916 ) 

2917 return type(self)(result) 

2918 

2919 def _dt_tz_convert(self, tz): 

2920 if self.dtype.pyarrow_dtype.tz is None: 

2921 raise TypeError( 

2922 "Cannot convert tz-naive timestamps, use tz_localize to localize" 

2923 ) 

2924 current_unit = self.dtype.pyarrow_dtype.unit 

2925 result = self._pa_array.cast(pa.timestamp(current_unit, tz)) 

2926 return type(self)(result) 

2927 

2928 

2929def transpose_homogeneous_pyarrow( 

2930 arrays: Sequence[ArrowExtensionArray], 

2931) -> list[ArrowExtensionArray]: 

2932 """Transpose arrow extension arrays in a list, but faster. 

2933 

2934 Input should be a list of arrays of equal length and all have the same 

2935 dtype. The caller is responsible for ensuring validity of input data. 

2936 """ 

2937 arrays = list(arrays) 

2938 nrows, ncols = len(arrays[0]), len(arrays) 

2939 indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten() 

2940 arr = pa.chunked_array([chunk for arr in arrays for chunk in arr._pa_array.chunks]) 

2941 arr = arr.take(indices) 

2942 return [ArrowExtensionArray(arr.slice(i * ncols, ncols)) for i in range(nrows)]
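
The take-based transpose avoids any per-cell Python loop; the index math for a 2x3 frame (three columns of length two), standalone:

import numpy as np
import pyarrow as pa

nrows, ncols = 2, 3
indices = np.arange(nrows * ncols).reshape(ncols, nrows).T.flatten()
# -> [0, 2, 4, 1, 3, 5]: element i of every column, grouped per output row i
arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4]), pa.array([5, 6])])
arr.take(indices)  # -> [1, 3, 5, 2, 4, 6]; each row is then an ncols-sized slice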