Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/arrow/array.py: 19%

1003 statements  

1from __future__ import annotations 

2 

3from copy import deepcopy 

4import functools 

5import operator 

6import re 

7import sys 

8import textwrap 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 Callable, 

13 Literal, 

14 Sequence, 

15 TypeVar, 

16 cast, 

17) 

18import unicodedata 

19 

20import numpy as np 

21 

22from pandas._libs import lib 

23from pandas._typing import ( 

24 ArrayLike, 

25 AxisInt, 

26 Dtype, 

27 FillnaOptions, 

28 Iterator, 

29 NpDtype, 

30 PositionalIndexer, 

31 Scalar, 

32 SortKind, 

33 TakeIndexer, 

34 TimeAmbiguous, 

35 TimeNonexistent, 

36 npt, 

37) 

38from pandas.compat import ( 

39 pa_version_under7p0, 

40 pa_version_under8p0, 

41 pa_version_under9p0, 

42 pa_version_under11p0, 

43) 

44from pandas.util._decorators import doc 

45from pandas.util._validators import validate_fillna_kwargs 

46 

47from pandas.core.dtypes.common import ( 

48 is_array_like, 

49 is_bool_dtype, 

50 is_integer, 

51 is_integer_dtype, 

52 is_list_like, 

53 is_object_dtype, 

54 is_scalar, 

55) 

56from pandas.core.dtypes.dtypes import DatetimeTZDtype 

57from pandas.core.dtypes.missing import isna 

58 

59from pandas.core import roperator 

60from pandas.core.arraylike import OpsMixin 

61from pandas.core.arrays.base import ( 

62 ExtensionArray, 

63 ExtensionArraySupportsAnyAll, 

64) 

65import pandas.core.common as com 

66from pandas.core.indexers import ( 

67 check_array_indexer, 

68 unpack_tuple_and_ellipses, 

69 validate_indices, 

70) 

71from pandas.core.strings.base import BaseStringArrayMethods 

72 

73from pandas.tseries.frequencies import to_offset 

74 

75if not pa_version_under7p0: 

76 import pyarrow as pa 

77 import pyarrow.compute as pc 

78 

79 from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning 

80 from pandas.core.arrays.arrow.dtype import ArrowDtype 

81 

82 ARROW_CMP_FUNCS = { 

83 "eq": pc.equal, 

84 "ne": pc.not_equal, 

85 "lt": pc.less, 

86 "gt": pc.greater, 

87 "le": pc.less_equal, 

88 "ge": pc.greater_equal, 

89 } 

90 

91 ARROW_LOGICAL_FUNCS = { 

92 "and_": pc.and_kleene, 

93 "rand_": lambda x, y: pc.and_kleene(y, x), 

94 "or_": pc.or_kleene, 

95 "ror_": lambda x, y: pc.or_kleene(y, x), 

96 "xor": pc.xor, 

97 "rxor": lambda x, y: pc.xor(y, x), 

98 } 

99 
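 # Example sketch (added for illustration; assumes pyarrow is importable as
 # pa/pc per the imports above): the Kleene kernels propagate null only when
 # the result is logically undecidable, matching pandas' masked boolean ops.
 #   pc.and_kleene(pa.scalar(False), pa.scalar(None, type=pa.bool_()))  # -> False
 #   pc.and_kleene(pa.scalar(True), pa.scalar(None, type=pa.bool_()))   # -> null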

100 def cast_for_truediv( 

101 arrow_array: pa.ChunkedArray, pa_object: pa.Array | pa.Scalar 

102 ) -> pa.ChunkedArray: 

103 # Ensure int / int -> float mirroring Python/Numpy behavior 

104 # as pc.divide_checked(int, int) -> int 

105 if pa.types.is_integer(arrow_array.type) and pa.types.is_integer( 

106 pa_object.type 

107 ): 

108 return arrow_array.cast(pa.float64()) 

109 return arrow_array 

110 

111 def floordiv_compat( 

112 left: pa.ChunkedArray | pa.Array | pa.Scalar, 

113 right: pa.ChunkedArray | pa.Array | pa.Scalar, 

114 ) -> pa.ChunkedArray: 

115 # Ensure int // int -> int mirroring Python/Numpy behavior 

116 # as pc.floor(pc.divide_checked(int, int)) -> float 

117 result = pc.floor(pc.divide(left, right)) 

118 if pa.types.is_integer(left.type) and pa.types.is_integer(right.type): 

119 result = result.cast(left.type) 

120 return result 

121 

122 ARROW_ARITHMETIC_FUNCS = { 

123 "add": pc.add_checked, 

124 "radd": lambda x, y: pc.add_checked(y, x), 

125 "sub": pc.subtract_checked, 

126 "rsub": lambda x, y: pc.subtract_checked(y, x), 

127 "mul": pc.multiply_checked, 

128 "rmul": lambda x, y: pc.multiply_checked(y, x), 

129 "truediv": lambda x, y: pc.divide(cast_for_truediv(x, y), y), 

130 "rtruediv": lambda x, y: pc.divide(y, cast_for_truediv(x, y)), 

131 "floordiv": lambda x, y: floordiv_compat(x, y), 

132 "rfloordiv": lambda x, y: floordiv_compat(y, x), 

133 "mod": NotImplemented, 

134 "rmod": NotImplemented, 

135 "divmod": NotImplemented, 

136 "rdivmod": NotImplemented, 

137 "pow": pc.power_checked, 

138 "rpow": lambda x, y: pc.power_checked(y, x), 

139 } 

140 
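 # Example sketch of the compat helpers above (shown with non-negative
 # operands; names are the module's own):
 #   pc.divide(cast_for_truediv(pa.chunked_array([[1, 7]]), pa.scalar(2)), pa.scalar(2))
 #   # -> [0.5, 3.5] as float64, mirroring Python's "/"
 #   floordiv_compat(pa.chunked_array([[7, 9]]), pa.scalar(2))
 #   # -> [3, 4], cast back to int64 to mirror Python's "//"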

141if TYPE_CHECKING: 

142 from pandas._typing import ( 

143 NumpySorter, 

144 NumpyValueArrayLike, 

145 ) 

146 

147 from pandas import Series 

148 

149ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") 

150 

151 

152def get_unit_from_pa_dtype(pa_dtype): 

153 # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804 

154 if pa_version_under11p0: 

155 unit = str(pa_dtype).split("[", 1)[-1][:-1] 

156 if unit not in ["s", "ms", "us", "ns"]: 

157 raise ValueError(pa_dtype) 

158 return unit 

159 return pa_dtype.unit 

160 
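# Example sketch: on pyarrow < 11 the unit is recovered from the type's
# string form, e.g. str(pa.time64("us")) == "time64[us]" parses to "us";
# newer pyarrow exposes the unit directly via pa_dtype.unit.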

161 

162def to_pyarrow_type( 

163 dtype: ArrowDtype | pa.DataType | Dtype | None, 

164) -> pa.DataType | None: 

165 """ 

166 Convert dtype to a pyarrow type instance. 

167 """ 

168 if isinstance(dtype, ArrowDtype): 

169 return dtype.pyarrow_dtype 

170 elif isinstance(dtype, pa.DataType): 

171 return dtype 

172 elif isinstance(dtype, DatetimeTZDtype): 

173 return pa.timestamp(dtype.unit, dtype.tz) 

174 elif dtype: 

175 try: 

176 # Accepts python types too 

177 # Doesn't handle all numpy types 

178 return pa.from_numpy_dtype(dtype) 

179 except pa.ArrowNotImplementedError: 

180 pass 

181 return None 

182 
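# Example sketch of the conversion above (names as defined in this module):
#   to_pyarrow_type(ArrowDtype(pa.int64()))   # -> pa.int64()
#   to_pyarrow_type(pa.string())              # -> pa.string() (passthrough)
#   to_pyarrow_type(np.dtype("float32"))      # -> pa.float32(), via pa.from_numpy_dtype
#   to_pyarrow_type(None)                     # -> None (let pyarrow infer later)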

183 

184class ArrowExtensionArray( 

185 OpsMixin, ExtensionArraySupportsAnyAll, BaseStringArrayMethods 

186): 

187 """ 

188 Pandas ExtensionArray backed by a PyArrow ChunkedArray. 

189 

190 .. warning:: 

191 

192 ArrowExtensionArray is considered experimental. The implementation and 

193 parts of the API may change without warning. 

194 

195 Parameters 

196 ---------- 

197 values : pyarrow.Array or pyarrow.ChunkedArray 

198 

199 Attributes 

200 ---------- 

201 None 

202 

203 Methods 

204 ------- 

205 None 

206 

207 Returns 

208 ------- 

209 ArrowExtensionArray 

210 

211 Notes 

212 ----- 

213 Most methods are implemented using `pyarrow compute functions <https://arrow.apache.org/docs/python/api/compute.html>`__.

214 Some methods may either raise an exception or raise a ``PerformanceWarning`` if an 

215 associated compute function is not available based on the installed version of PyArrow. 

216 

217 Please install the latest version of PyArrow to enable the best functionality and avoid 

218 potential bugs in prior versions of PyArrow. 

219 

220 Examples 

221 -------- 

222 Create an ArrowExtensionArray with :func:`pandas.array`: 

223 

224 >>> pd.array([1, 1, None], dtype="int64[pyarrow]") 

225 <ArrowExtensionArray> 

226 [1, 1, <NA>] 

227 Length: 3, dtype: int64[pyarrow] 

228 """ # noqa: E501 (http link too long) 

229 

230 _data: pa.ChunkedArray 

231 _dtype: ArrowDtype 

232 

233 def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: 

234 if pa_version_under7p0: 

235 msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray." 

236 raise ImportError(msg) 

237 if isinstance(values, pa.Array): 

238 self._data = pa.chunked_array([values]) 

239 elif isinstance(values, pa.ChunkedArray): 

240 self._data = values 

241 else: 

242 raise ValueError( 

243 f"Unsupported type '{type(values)}' for ArrowExtensionArray" 

244 ) 

245 self._dtype = ArrowDtype(self._data.type) 

246 

247 @classmethod 

248 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): 

249 """ 

250 Construct a new ExtensionArray from a sequence of scalars. 

251 """ 

252 pa_dtype = to_pyarrow_type(dtype) 

253 if ( 

254 isinstance(scalars, np.ndarray) 

255 and isinstance(dtype, ArrowDtype) 

256 and ( 

257 pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) 

258 ) 

259 ): 

260 # See https://github.com/apache/arrow/issues/35289 

261 scalars = scalars.tolist() 

262 

263 if isinstance(scalars, cls): 

264 scalars = scalars._data 

265 elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): 

266 if copy and is_array_like(scalars): 

267 # pa array should not get updated when numpy array is updated 

268 scalars = deepcopy(scalars) 

269 try: 

270 scalars = pa.array(scalars, type=pa_dtype, from_pandas=True) 

271 except pa.ArrowInvalid: 

272 # GH50430: let pyarrow infer type, then cast 

273 scalars = pa.array(scalars, from_pandas=True) 

274 if pa_dtype: 

275 if pa.types.is_dictionary(pa_dtype): 

276 scalars = scalars.dictionary_encode() 

277 else: 

278 scalars = scalars.cast(pa_dtype) 

279 arr = cls(scalars) 

280 if pa.types.is_duration(scalars.type) and scalars.null_count > 0: 

281 # GH52843: upstream bug for duration types when originally 

282 # constructed with data containing numpy NaT. 

283 # https://github.com/apache/arrow/issues/35088 

284 arr = arr.fillna(arr.dtype.na_value) 

285 return arr 

286 
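 # Example sketch: from_pandas=True makes pa.array treat pandas-style
 # missing values as nulls, e.g.
 #   pa.array([1.0, float("nan")], from_pandas=True)  # -> [1, null]
 # whereas the default would keep nan as an ordinary float value.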

287 @classmethod 

288 def _from_sequence_of_strings( 

289 cls, strings, *, dtype: Dtype | None = None, copy: bool = False 

290 ): 

291 """ 

292 Construct a new ExtensionArray from a sequence of strings. 

293 """ 

294 pa_type = to_pyarrow_type(dtype) 

295 if ( 

296 pa_type is None 

297 or pa.types.is_binary(pa_type) 

298 or pa.types.is_string(pa_type) 

299 ): 

300 # pa_type is None: Let pa.array infer 

301 # pa_type is string/binary: scalars already correct type 

302 scalars = strings 

303 elif pa.types.is_timestamp(pa_type): 

304 from pandas.core.tools.datetimes import to_datetime 

305 

306 scalars = to_datetime(strings, errors="raise") 

307 elif pa.types.is_date(pa_type): 

308 from pandas.core.tools.datetimes import to_datetime 

309 

310 scalars = to_datetime(strings, errors="raise").date 

311 elif pa.types.is_duration(pa_type): 

312 from pandas.core.tools.timedeltas import to_timedelta 

313 

314 scalars = to_timedelta(strings, errors="raise") 

315 if pa_type.unit != "ns": 

316 # GH51175: test_from_sequence_of_strings_pa_array 

317 # attempt to parse as int64 reflecting pyarrow's 

318 # duration to string casting behavior 

319 mask = isna(scalars) 

320 if not isinstance(strings, (pa.Array, pa.ChunkedArray)): 

321 strings = pa.array(strings, type=pa.string(), from_pandas=True) 

322 strings = pc.if_else(mask, None, strings) 

323 try: 

324 scalars = strings.cast(pa.int64()) 

325 except pa.ArrowInvalid: 

326 pass 

327 elif pa.types.is_time(pa_type): 

328 from pandas.core.tools.times import to_time 

329 

330 # "coerce" to allow "null times" (None) to not raise 

331 scalars = to_time(strings, errors="coerce") 

332 elif pa.types.is_boolean(pa_type): 

333 from pandas.core.arrays import BooleanArray 

334 

335 scalars = BooleanArray._from_sequence_of_strings(strings).to_numpy() 

336 elif ( 

337 pa.types.is_integer(pa_type) 

338 or pa.types.is_floating(pa_type) 

339 or pa.types.is_decimal(pa_type) 

340 ): 

341 from pandas.core.tools.numeric import to_numeric 

342 

343 scalars = to_numeric(strings, errors="raise") 

344 else: 

345 raise NotImplementedError( 

346 f"Converting strings to {pa_type} is not implemented." 

347 ) 

348 return cls._from_sequence(scalars, dtype=pa_type, copy=copy) 

349 

350 def __getitem__(self, item: PositionalIndexer): 

351 """Select a subset of self. 

352 

353 Parameters 

354 ---------- 

355 item : int, slice, or ndarray 

356 * int: The position in 'self' to get. 

357 * slice: A slice object, where 'start', 'stop', and 'step' are 

358 integers or None 

359 * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' 

360 

361 Returns 

362 ------- 

363 item : scalar or ExtensionArray 

364 

365 Notes 

366 ----- 

367 For scalar ``item``, return a scalar value suitable for the array's 

368 type. This should be an instance of ``self.dtype.type``. 

369 For slice ``key``, return an instance of ``ExtensionArray``, even 

370 if the slice is length 0 or 1. 

371 For a boolean mask, return an instance of ``ExtensionArray``, filtered 

372 to the values where ``item`` is True. 

373 """ 

374 item = check_array_indexer(self, item) 

375 

376 if isinstance(item, np.ndarray): 

377 if not len(item): 

378 # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] 

379 if self._dtype.name == "string" and self._dtype.storage == "pyarrow": 

380 pa_dtype = pa.string() 

381 else: 

382 pa_dtype = self._dtype.pyarrow_dtype 

383 return type(self)(pa.chunked_array([], type=pa_dtype)) 

384 elif is_integer_dtype(item.dtype): 

385 return self.take(item) 

386 elif is_bool_dtype(item.dtype): 

387 return type(self)(self._data.filter(item)) 

388 else: 

389 raise IndexError( 

390 "Only integers, slices and integer or " 

391 "boolean arrays are valid indices." 

392 ) 

393 elif isinstance(item, tuple): 

394 item = unpack_tuple_and_ellipses(item) 

395 

396 if item is Ellipsis: 

397 # TODO: should be handled by pyarrow? 

398 item = slice(None) 

399 

400 if is_scalar(item) and not is_integer(item): 

401 # e.g. "foo" or 2.5 

402 # exception message copied from numpy 

403 raise IndexError( 

404 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " 

405 r"(`None`) and integer or boolean arrays are valid indices" 

406 ) 

407 # We are not an array indexer, so maybe e.g. a slice or integer 

408 # indexer. We dispatch to pyarrow. 

409 value = self._data[item] 

410 if isinstance(value, pa.ChunkedArray): 

411 return type(self)(value) 

412 else: 

413 scalar = value.as_py() 

414 if scalar is None: 

415 return self._dtype.na_value 

416 else: 

417 return scalar 

418 

419 def __iter__(self) -> Iterator[Any]: 

420 """ 

421 Iterate over elements of the array. 

422 """ 

423 na_value = self._dtype.na_value 

424 for value in self._data: 

425 val = value.as_py() 

426 if val is None: 

427 yield na_value 

428 else: 

429 yield val 

430 

431 def __arrow_array__(self, type=None): 

432 """Convert myself to a pyarrow ChunkedArray.""" 

433 return self._data 

434 

435 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: 

436 """Correctly construct numpy arrays when passed to `np.asarray()`.""" 

437 return self.to_numpy(dtype=dtype) 

438 

439 def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

440 return type(self)(pc.invert(self._data)) 

441 

442 def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

443 return type(self)(pc.negate_checked(self._data)) 

444 

445 def __pos__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

446 return type(self)(self._data) 

447 

448 def __abs__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

449 return type(self)(pc.abs_checked(self._data)) 

450 

451 # GH 42600: __getstate__/__setstate__ not necessary once 

452 # https://issues.apache.org/jira/browse/ARROW-10739 is addressed 

453 def __getstate__(self): 

454 state = self.__dict__.copy() 

455 state["_data"] = self._data.combine_chunks() 

456 return state 

457 

458 def __setstate__(self, state) -> None: 

459 state["_data"] = pa.chunked_array(state["_data"]) 

460 self.__dict__.update(state) 

461 

462 def _cmp_method(self, other, op): 

463 from pandas.core.arrays.masked import BaseMaskedArray 

464 

465 pc_func = ARROW_CMP_FUNCS[op.__name__] 

466 if isinstance(other, ArrowExtensionArray): 

467 result = pc_func(self._data, other._data) 

468 elif isinstance(other, (np.ndarray, list)): 

469 result = pc_func(self._data, other) 

470 elif isinstance(other, BaseMaskedArray): 

471 # GH 52625 

472 result = pc_func(self._data, other.__arrow_array__()) 

473 elif is_scalar(other): 

474 try: 

475 result = pc_func(self._data, pa.scalar(other)) 

476 except (pa.lib.ArrowNotImplementedError, pa.lib.ArrowInvalid): 

477 mask = isna(self) | isna(other) 

478 valid = ~mask 

479 result = np.zeros(len(self), dtype="bool") 

480 result[valid] = op(np.array(self)[valid], other) 

481 result = pa.array(result, type=pa.bool_()) 

482 result = pc.if_else(valid, result, None) 

483 else: 

484 raise NotImplementedError( 

485 f"{op.__name__} not implemented for {type(other)}" 

486 ) 

487 return ArrowExtensionArray(result) 

488 
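 # Example sketch of the comparison path (reached via the OpsMixin dunders):
 #   pd.array([1, 2, None], dtype="int64[pyarrow]") == 1
 # returns a bool[pyarrow] ArrowExtensionArray of [True, False, <NA>],
 # with the null propagated rather than coerced to False.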

489 def _evaluate_op_method(self, other, op, arrow_funcs): 

490 from pandas.core.arrays.masked import BaseMaskedArray 

491 

492 pa_type = self._data.type 

493 if (pa.types.is_string(pa_type) or pa.types.is_binary(pa_type)) and op in [ 

494 operator.add, 

495 roperator.radd, 

496 ]: 

497 length = self._data.length() 

498 

499 seps: list[str] | list[bytes] 

500 if pa.types.is_string(pa_type): 

501 seps = [""] * length 

502 else: 

503 seps = [b""] * length 

504 

505 if is_scalar(other): 

506 other = [other] * length 

507 elif isinstance(other, type(self)): 

508 other = other._data 

509 if op is operator.add: 

510 result = pc.binary_join_element_wise(self._data, other, seps) 

511 else: 

512 result = pc.binary_join_element_wise(other, self._data, seps) 

513 return type(self)(result) 

514 

515 pc_func = arrow_funcs[op.__name__] 

516 if pc_func is NotImplemented: 

517 raise NotImplementedError(f"{op.__name__} not implemented.") 

518 if isinstance(other, ArrowExtensionArray): 

519 result = pc_func(self._data, other._data) 

520 elif isinstance(other, (np.ndarray, list)): 

521 result = pc_func(self._data, pa.array(other, from_pandas=True)) 

522 elif isinstance(other, BaseMaskedArray): 

523 # GH 52625 

524 result = pc_func(self._data, other.__arrow_array__()) 

525 elif is_scalar(other): 

526 if isna(other) and op.__name__ in ARROW_LOGICAL_FUNCS: 

527 # pyarrow kleene ops require null to be typed 

528 pa_scalar = pa.scalar(None, type=self._data.type) 

529 else: 

530 pa_scalar = pa.scalar(other) 

531 result = pc_func(self._data, pa_scalar) 

532 else: 

533 raise NotImplementedError( 

534 f"{op.__name__} not implemented for {type(other)}" 

535 ) 

536 return type(self)(result) 

537 
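 # Example sketch of the string-concat branch above: with empty separators,
 # pc.binary_join_element_wise degenerates to element-wise concatenation:
 #   pc.binary_join_element_wise(pa.array(["a", "b"]), pa.array(["x", "y"]), pa.array(["", ""]))
 #   # -> ["ax", "by"]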

538 def _logical_method(self, other, op): 

539 return self._evaluate_op_method(other, op, ARROW_LOGICAL_FUNCS) 

540 

541 def _arith_method(self, other, op): 

542 return self._evaluate_op_method(other, op, ARROW_ARITHMETIC_FUNCS) 

543 

544 def equals(self, other) -> bool: 

545 if not isinstance(other, ArrowExtensionArray): 

546 return False 

547 # I'm told that pyarrow makes __eq__ behave like pandas' equals; 

548 # TODO: is this documented somewhere? 

549 return self._data == other._data 

550 

551 @property 

552 def dtype(self) -> ArrowDtype: 

553 """ 

554 An instance of 'ExtensionDtype'. 

555 """ 

556 return self._dtype 

557 

558 @property 

559 def nbytes(self) -> int: 

560 """ 

561 The number of bytes needed to store this object in memory. 

562 """ 

563 return self._data.nbytes 

564 

565 def __len__(self) -> int: 

566 """ 

567 Length of this array. 

568 

569 Returns 

570 ------- 

571 length : int 

572 """ 

573 return len(self._data) 

574 

575 def __contains__(self, key) -> bool: 

576 # https://github.com/pandas-dev/pandas/pull/51307#issuecomment-1426372604 

577 if isna(key) and key is not self.dtype.na_value: 

578 if self.dtype.kind == "f" and lib.is_float(key) and isna(key): 

579 return pc.any(pc.is_nan(self._data)).as_py() 

580 

581 # e.g. date or timestamp types we do not allow None here to match pd.NA 

582 return False 

583 # TODO: maybe complex? object? 

584 

585 return bool(super().__contains__(key)) 

586 

587 @property 

588 def _hasna(self) -> bool: 

589 return self._data.null_count > 0 

590 

591 def isna(self) -> npt.NDArray[np.bool_]: 

592 """ 

593 Boolean NumPy array indicating if each value is missing. 

594 

595 This should return a 1-D array the same length as 'self'. 

596 """ 

597 return self._data.is_null().to_numpy() 

598 

599 def any(self, *, skipna: bool = True, **kwargs): 

600 """ 

601 Return whether any element is truthy. 

602 

603 Returns False unless there is at least one element that is truthy. 

604 By default, NAs are skipped. If ``skipna=False`` is specified and 

605 missing values are present, similar :ref:`Kleene logic <boolean.kleene>` 

606 is used as for logical operations. 

607 

608 Parameters 

609 ---------- 

610 skipna : bool, default True 

611 Exclude NA values. If the entire array is NA and `skipna` is 

612 True, then the result will be False, as for an empty array. 

613 If `skipna` is False, the result will still be True if there is 

614 at least one element that is truthy, otherwise NA will be returned 

615 if there are NA's present. 

616 

617 Returns 

618 ------- 

619 bool or :attr:`pandas.NA` 

620 

621 See Also 

622 -------- 

623 ArrowExtensionArray.all : Return whether all elements are truthy. 

624 

625 Examples 

626 -------- 

627 The result indicates whether any element is truthy (and by default 

628 skips NAs): 

629 

630 >>> pd.array([True, False, True], dtype="boolean[pyarrow]").any() 

631 True 

632 >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any() 

633 True 

634 >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any() 

635 False 

636 >>> pd.array([], dtype="boolean[pyarrow]").any() 

637 False 

638 >>> pd.array([pd.NA], dtype="boolean[pyarrow]").any() 

639 False 

640 >>> pd.array([pd.NA], dtype="float64[pyarrow]").any() 

641 False 

642 

643 With ``skipna=False``, the result can be NA if this is logically 

644 required (whether ``pd.NA`` is True or False influences the result): 

645 

646 >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) 

647 True 

648 >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) 

649 True 

650 >>> pd.array([False, False, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) 

651 <NA> 

652 >>> pd.array([0, 0, pd.NA], dtype="boolean[pyarrow]").any(skipna=False) 

653 <NA> 

654 """ 

655 return self._reduce("any", skipna=skipna, **kwargs) 

656 

657 def all(self, *, skipna: bool = True, **kwargs): 

658 """ 

659 Return whether all elements are truthy. 

660 

661 Returns True unless there is at least one element that is falsey. 

662 By default, NAs are skipped. If ``skipna=False`` is specified and 

663 missing values are present, similar :ref:`Kleene logic <boolean.kleene>` 

664 is used as for logical operations. 

665 

666 Parameters 

667 ---------- 

668 skipna : bool, default True 

669 Exclude NA values. If the entire array is NA and `skipna` is 

670 True, then the result will be True, as for an empty array. 

671 If `skipna` is False, the result will still be False if there is 

672 at least one element that is falsey, otherwise NA will be returned 

673 if there are NA's present. 

674 

675 Returns 

676 ------- 

677 bool or :attr:`pandas.NA` 

678 

679 See Also 

680 -------- 

681 ArrowExtensionArray.any : Return whether any element is truthy. 

682 

683 Examples 

684 -------- 

685 The result indicates whether all elements are truthy (and by default 

686 skips NAs): 

687 

688 >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all() 

689 True 

690 >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all() 

691 True 

692 >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all() 

693 False 

694 >>> pd.array([], dtype="boolean[pyarrow]").all() 

695 True 

696 >>> pd.array([pd.NA], dtype="boolean[pyarrow]").all() 

697 True 

698 >>> pd.array([pd.NA], dtype="float64[pyarrow]").all() 

699 True 

700 

701 With ``skipna=False``, the result can be NA if this is logically 

702 required (whether ``pd.NA`` is True or False influences the result): 

703 

704 >>> pd.array([True, True, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) 

705 <NA> 

706 >>> pd.array([1, 1, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) 

707 <NA> 

708 >>> pd.array([True, False, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) 

709 False 

710 >>> pd.array([1, 0, pd.NA], dtype="boolean[pyarrow]").all(skipna=False) 

711 False 

712 """ 

713 return self._reduce("all", skipna=skipna, **kwargs) 

714 

715 def argsort( 

716 self, 

717 *, 

718 ascending: bool = True, 

719 kind: SortKind = "quicksort", 

720 na_position: str = "last", 

721 **kwargs, 

722 ) -> np.ndarray: 

723 order = "ascending" if ascending else "descending" 

724 null_placement = {"last": "at_end", "first": "at_start"}.get(na_position, None) 

725 if null_placement is None: 

726 raise ValueError(f"invalid na_position: {na_position}") 

727 

728 result = pc.array_sort_indices( 

729 self._data, order=order, null_placement=null_placement 

730 ) 

731 np_result = result.to_numpy() 

732 return np_result.astype(np.intp, copy=False) 

733 

734 def _argmin_max(self, skipna: bool, method: str) -> int: 

735 if self._data.length() in (0, self._data.null_count) or ( 

736 self._hasna and not skipna 

737 ): 

738 # For empty or all null, pyarrow returns -1 but pandas expects TypeError 

739 # For skipna=False and data w/ null, pandas expects NotImplementedError 

740 # let ExtensionArray.arg{max|min} raise 

741 return getattr(super(), f"arg{method}")(skipna=skipna) 

742 

743 data = self._data 

744 if pa.types.is_duration(data.type): 

745 data = data.cast(pa.int64()) 

746 

747 value = getattr(pc, method)(data, skip_nulls=skipna) 

748 return pc.index(data, value).as_py() 

749 

750 def argmin(self, skipna: bool = True) -> int: 

751 return self._argmin_max(skipna, "min") 

752 

753 def argmax(self, skipna: bool = True) -> int: 

754 return self._argmin_max(skipna, "max") 

755 

756 def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

757 """ 

758 Return a shallow copy of the array. 

759 

760 Underlying ChunkedArray is immutable, so a deep copy is unnecessary. 

761 

762 Returns 

763 ------- 

764 type(self) 

765 """ 

766 return type(self)(self._data) 

767 

768 def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

769 """ 

770 Return ArrowExtensionArray without NA values. 

771 

772 Returns 

773 ------- 

774 ArrowExtensionArray 

775 """ 

776 return type(self)(pc.drop_null(self._data)) 

777 

778 @doc(ExtensionArray.fillna) 

779 def fillna( 

780 self: ArrowExtensionArrayT, 

781 value: object | ArrayLike | None = None, 

782 method: FillnaOptions | None = None, 

783 limit: int | None = None, 

784 ) -> ArrowExtensionArrayT: 

785 value, method = validate_fillna_kwargs(value, method) 

786 

787 if limit is not None: 

788 return super().fillna(value=value, method=method, limit=limit) 

789 

790 if method is not None: 

791 fallback_performancewarning() 

792 return super().fillna(value=value, method=method, limit=limit) 

793 

794 if is_array_like(value): 

795 value = cast(ArrayLike, value) 

796 if len(value) != len(self): 

797 raise ValueError( 

798 f"Length of 'value' does not match. Got ({len(value)}) " 

799 f" expected {len(self)}" 

800 ) 

801 

802 def convert_fill_value(value, pa_type, dtype): 

803 if value is None: 

804 return value 

805 if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): 

806 return value 

807 if is_array_like(value): 

808 pa_box = pa.array 

809 else: 

810 pa_box = pa.scalar 

811 try: 

812 value = pa_box(value, type=pa_type, from_pandas=True) 

813 except pa.ArrowTypeError as err: 

814 msg = f"Invalid value '{str(value)}' for dtype {dtype}" 

815 raise TypeError(msg) from err 

816 return value 

817 

818 fill_value = convert_fill_value(value, self._data.type, self.dtype) 

819 

820 try: 

821 if method is None: 

822 return type(self)(pc.fill_null(self._data, fill_value=fill_value)) 

823 elif method == "pad": 

824 return type(self)(pc.fill_null_forward(self._data)) 

825 elif method == "backfill": 

826 return type(self)(pc.fill_null_backward(self._data)) 

827 except pa.ArrowNotImplementedError: 

828 # ArrowNotImplementedError: Function 'coalesce' has no kernel 

829 # matching input types (duration[ns], duration[ns]) 

830 # TODO: remove try/except wrapper if/when pyarrow implements 

831 # a kernel for duration types. 

832 pass 

833 

834 return super().fillna(value=value, method=method, limit=limit) 

835 
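 # Example sketch of the pyarrow fast paths used above:
 #   pc.fill_null(pa.chunked_array([[1, None, 3]]), fill_value=0)  # -> [1, 0, 3]
 #   pc.fill_null_forward(pa.chunked_array([[1, None, 3]]))        # -> [1, 1, 3]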

836 def isin(self, values) -> npt.NDArray[np.bool_]: 

837 # short-circuit to return all False array. 

838 if not len(values): 

839 return np.zeros(len(self), dtype=bool) 

840 

841 result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True)) 

842 # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls 

843 # to False 

844 return np.array(result, dtype=np.bool_) 

845 

846 def _values_for_factorize(self) -> tuple[np.ndarray, Any]: 

847 """ 

848 Return an array and missing value suitable for factorization. 

849 

850 Returns 

851 ------- 

852 values : ndarray 

853 na_value : pd.NA 

854 

855 Notes 

856 ----- 

857 The values returned by this method are also used in 

858 :func:`pandas.util.hash_pandas_object`. 

859 """ 

860 values = self._data.to_numpy() 

861 return values, self.dtype.na_value 

862 

863 @doc(ExtensionArray.factorize) 

864 def factorize( 

865 self, 

866 use_na_sentinel: bool = True, 

867 ) -> tuple[np.ndarray, ExtensionArray]: 

868 null_encoding = "mask" if use_na_sentinel else "encode" 

869 

870 pa_type = self._data.type 

871 if pa.types.is_duration(pa_type): 

872 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 

873 data = self._data.cast(pa.int64()) 

874 else: 

875 data = self._data 

876 

877 if pa.types.is_dictionary(data.type): 

878 encoded = data 

879 else: 

880 encoded = data.dictionary_encode(null_encoding=null_encoding) 

881 if encoded.length() == 0: 

882 indices = np.array([], dtype=np.intp) 

883 uniques = type(self)(pa.chunked_array([], type=encoded.type.value_type)) 

884 else: 

885 pa_indices = encoded.combine_chunks().indices 

886 if pa_indices.null_count > 0: 

887 pa_indices = pc.fill_null(pa_indices, -1) 

888 indices = pa_indices.to_numpy(zero_copy_only=False, writable=True).astype( 

889 np.intp, copy=False 

890 ) 

891 uniques = type(self)(encoded.chunk(0).dictionary) 

892 

893 if pa.types.is_duration(pa_type): 

894 uniques = cast(ArrowExtensionArray, uniques.astype(self.dtype)) 

895 return indices, uniques 

896 
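 # Example sketch: factorize piggybacks on Arrow dictionary encoding, e.g.
 #   pd.array(["a", "b", "a"], dtype=pd.ArrowDtype(pa.string())).factorize()
 # yields roughly codes array([0, 1, 0]) and uniques ["a", "b"]; nulls
 # become the -1 sentinel when use_na_sentinel=True.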

897 def reshape(self, *args, **kwargs): 

898 raise NotImplementedError( 

899 f"{type(self)} does not support reshape " 

900 f"as backed by a 1D pyarrow.ChunkedArray." 

901 ) 

902 

903 def round( 

904 self: ArrowExtensionArrayT, decimals: int = 0, *args, **kwargs 

905 ) -> ArrowExtensionArrayT: 

906 """ 

907 Round each value in the array to the given number of decimals. 

908 

909 Parameters 

910 ---------- 

911 decimals : int, default 0 

912 Number of decimal places to round to. If decimals is negative, 

913 it specifies the number of positions to the left of the decimal point. 

914 *args, **kwargs 

915 Additional arguments and keywords have no effect. 

916 

917 Returns 

918 ------- 

919 ArrowExtensionArray 

920 Rounded values of the ArrowExtensionArray. 

921 

922 See Also 

923 -------- 

924 DataFrame.round : Round values of a DataFrame. 

925 Series.round : Round values of a Series. 

926 """ 

927 return type(self)(pc.round(self._data, ndigits=decimals)) 

928 

929 @doc(ExtensionArray.searchsorted) 

930 def searchsorted( 

931 self, 

932 value: NumpyValueArrayLike | ExtensionArray, 

933 side: Literal["left", "right"] = "left", 

934 sorter: NumpySorter = None, 

935 ) -> npt.NDArray[np.intp] | np.intp: 

936 if self._hasna: 

937 raise ValueError( 

938 "searchsorted requires array to be sorted, which is impossible " 

939 "with NAs present." 

940 ) 

941 if isinstance(value, ExtensionArray): 

942 value = value.astype(object) 

943 # Base class searchsorted would cast to object, which is *much* slower. 

944 return self.to_numpy().searchsorted(value, side=side, sorter=sorter) 

945 

946 def take( 

947 self, 

948 indices: TakeIndexer, 

949 allow_fill: bool = False, 

950 fill_value: Any = None, 

951 ) -> ArrowExtensionArray: 

952 """ 

953 Take elements from an array. 

954 

955 Parameters 

956 ---------- 

957 indices : sequence of int or one-dimensional np.ndarray of int 

958 Indices to be taken. 

959 allow_fill : bool, default False 

960 How to handle negative values in `indices`. 

961 

962 * False: negative values in `indices` indicate positional indices 

963 from the right (the default). This is similar to 

964 :func:`numpy.take`. 

965 

966 * True: negative values in `indices` indicate 

967 missing values. These values are set to `fill_value`. Any other 

968 negative values raise a ``ValueError``. 

969 

970 fill_value : any, optional 

971 Fill value to use for NA-indices when `allow_fill` is True. 

972 This may be ``None``, in which case the default NA value for 

973 the type, ``self.dtype.na_value``, is used. 

974 

975 For many ExtensionArrays, there will be two representations of 

976 `fill_value`: a user-facing "boxed" scalar, and a low-level 

977 physical NA value. `fill_value` should be the user-facing version, 

978 and the implementation should handle translating that to the 

979 physical version for processing the take if necessary. 

980 

981 Returns 

982 ------- 

983 ExtensionArray 

984 

985 Raises 

986 ------ 

987 IndexError 

988 When the indices are out of bounds for the array. 

989 ValueError 

990 When `indices` contains negative values other than ``-1`` 

991 and `allow_fill` is True. 

992 

993 See Also 

994 -------- 

995 numpy.take 

996 api.extensions.take 

997 

998 Notes 

999 ----- 

1000 ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, 

1001 ``iloc``, when `indices` is a sequence of values. Additionally, 

1002 it's called by :meth:`Series.reindex`, or any other method 

1003 that causes realignment, with a `fill_value`. 

1004 """ 

1005 # TODO: Remove once we get rid of the (indices < 0) check 

1006 if not is_array_like(indices): 

1007 indices_array = np.asanyarray(indices) 

1008 else: 

1009 # error: Incompatible types in assignment (expression has type 

1010 # "Sequence[int]", variable has type "ndarray") 

1011 indices_array = indices # type: ignore[assignment] 

1012 

1013 if len(self._data) == 0 and (indices_array >= 0).any(): 

1014 raise IndexError("cannot do a non-empty take") 

1015 if indices_array.size > 0 and indices_array.max() >= len(self._data): 

1016 raise IndexError("out of bounds value in 'indices'.") 

1017 

1018 if allow_fill: 

1019 fill_mask = indices_array < 0 

1020 if fill_mask.any(): 

1021 validate_indices(indices_array, len(self._data)) 

1022 # TODO(ARROW-9433): Treat negative indices as NULL 

1023 indices_array = pa.array(indices_array, mask=fill_mask) 

1024 result = self._data.take(indices_array) 

1025 if isna(fill_value): 

1026 return type(self)(result) 

1027 # TODO: ArrowNotImplementedError: Function fill_null has no 

1028 # kernel matching input types (array[string], scalar[string]) 

1029 result = type(self)(result) 

1030 result[fill_mask] = fill_value 

1031 return result 

1032 # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) 

1033 else: 

1034 # Nothing to fill 

1035 return type(self)(self._data.take(indices)) 

1036 else: # allow_fill=False 

1037 # TODO(ARROW-9432): Treat negative indices as indices from the right. 

1038 if (indices_array < 0).any(): 

1039 # Don't modify in-place 

1040 indices_array = np.copy(indices_array) 

1041 indices_array[indices_array < 0] += len(self._data) 

1042 return type(self)(self._data.take(indices_array)) 

1043 
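 # Example sketch of the two take modes documented above:
 #   arr = pd.array([10, 20, 30], dtype="int64[pyarrow]")
 #   arr.take([0, -1])                   # -> [10, 30] (negative = from the right)
 #   arr.take([0, -1], allow_fill=True)  # -> [10, <NA>] (negative = missing)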

1044 @doc(ExtensionArray.to_numpy) 

1045 def to_numpy( 

1046 self, 

1047 dtype: npt.DTypeLike | None = None, 

1048 copy: bool = False, 

1049 na_value: object = lib.no_default, 

1050 ) -> np.ndarray: 

1051 if dtype is None and self._hasna: 

1052 dtype = object 

1053 if na_value is lib.no_default: 

1054 na_value = self.dtype.na_value 

1055 

1056 pa_type = self._data.type 

1057 if pa.types.is_temporal(pa_type) and not pa.types.is_date(pa_type): 

1058 # temporal types with units and/or timezones currently 

1059 # require pandas/python scalars to pass all tests 

1060 # TODO: improve performance (this is slow) 

1061 result = np.array(list(self), dtype=dtype) 

1062 elif is_object_dtype(dtype) and self._hasna: 

1063 result = np.empty(len(self), dtype=object) 

1064 mask = ~self.isna() 

1065 result[mask] = np.asarray(self[mask]._data) 

1066 elif pa.types.is_null(self._data.type): 

1067 result = np.asarray(self._data, dtype=dtype) 

1068 if not isna(na_value): 

1069 result[:] = na_value 

1070 return result 

1071 elif self._hasna: 

1072 data = self.copy() 

1073 data[self.isna()] = na_value 

1074 return np.asarray(data._data, dtype=dtype) 

1075 else: 

1076 result = np.asarray(self._data, dtype=dtype) 

1077 if copy: 

1078 result = result.copy() 

1079 if self._hasna: 

1080 result[self.isna()] = na_value 

1081 return result 

1082 
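 # Example sketch: with nulls present and no dtype given, to_numpy falls
 # back to object dtype so pd.NA can be represented:
 #   pd.array([1, None], dtype="int64[pyarrow]").to_numpy()
 #   # -> array([1, <NA>], dtype=object)
 #   pd.array([1, None], dtype="int64[pyarrow]").to_numpy("float64", na_value=np.nan)
 #   # -> array([ 1., nan])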

1083 def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: 

1084 """ 

1085 Compute the ArrowExtensionArray of unique values. 

1086 

1087 Returns 

1088 ------- 

1089 ArrowExtensionArray 

1090 """ 

1091 pa_type = self._data.type 

1092 

1093 if pa.types.is_duration(pa_type): 

1094 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 

1095 data = self._data.cast(pa.int64()) 

1096 else: 

1097 data = self._data 

1098 

1099 pa_result = pc.unique(data) 

1100 

1101 if pa.types.is_duration(pa_type): 

1102 pa_result = pa_result.cast(pa_type) 

1103 

1104 return type(self)(pa_result) 

1105 

1106 def value_counts(self, dropna: bool = True) -> Series: 

1107 """ 

1108 Return a Series containing counts of each unique value. 

1109 

1110 Parameters 

1111 ---------- 

1112 dropna : bool, default True 

1113 Don't include counts of missing values. 

1114 

1115 Returns 

1116 ------- 

1117 counts : Series 

1118 

1119 See Also 

1120 -------- 

1121 Series.value_counts 

1122 """ 

1123 pa_type = self._data.type 

1124 if pa.types.is_duration(pa_type): 

1125 # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323 

1126 data = self._data.cast(pa.int64()) 

1127 else: 

1128 data = self._data 

1129 

1130 from pandas import ( 

1131 Index, 

1132 Series, 

1133 ) 

1134 

1135 vc = data.value_counts() 

1136 

1137 values = vc.field(0) 

1138 counts = vc.field(1) 

1139 if dropna and data.null_count > 0: 

1140 mask = values.is_valid() 

1141 values = values.filter(mask) 

1142 counts = counts.filter(mask) 

1143 

1144 if pa.types.is_duration(pa_type): 

1145 values = values.cast(pa_type) 

1146 

1147 counts = ArrowExtensionArray(counts) 

1148 

1149 index = Index(type(self)(values)) 

1150 

1151 return Series(counts, index=index, name="count", copy=False) 

1152 

1153 @classmethod 

1154 def _concat_same_type( 

1155 cls: type[ArrowExtensionArrayT], to_concat 

1156 ) -> ArrowExtensionArrayT: 

1157 """ 

1158 Concatenate multiple ArrowExtensionArrays. 

1159 

1160 Parameters 

1161 ---------- 

1162 to_concat : sequence of ArrowExtensionArrays 

1163 

1164 Returns 

1165 ------- 

1166 ArrowExtensionArray 

1167 """ 

1168 chunks = [array for ea in to_concat for array in ea._data.iterchunks()] 

1169 if to_concat[0].dtype == "string": 

1170 # StringDtype has no attribute pyarrow_dtype 

1171 pa_dtype = pa.string() 

1172 else: 

1173 pa_dtype = to_concat[0].dtype.pyarrow_dtype 

1174 arr = pa.chunked_array(chunks, type=pa_dtype) 

1175 return cls(arr) 

1176 

1177 def _accumulate( 

1178 self, name: str, *, skipna: bool = True, **kwargs 

1179 ) -> ArrowExtensionArray | ExtensionArray: 

1180 """ 

1181 Return an ExtensionArray performing an accumulation operation. 

1182 

1183 The underlying data type might change. 

1184 

1185 Parameters 

1186 ---------- 

1187 name : str 

1188 Name of the function, supported values are: 

1189 - cummin 

1190 - cummax 

1191 - cumsum 

1192 - cumprod 

1193 skipna : bool, default True 

1194 If True, skip NA values. 

1195 **kwargs 

1196 Additional keyword arguments passed to the accumulation function. 

1197 Currently, there is no supported kwarg. 

1198 

1199 Returns 

1200 ------- 

1201 array 

1202 

1203 Raises 

1204 ------ 

1205 NotImplementedError : subclass does not define accumulations 

1206 """ 

1207 pyarrow_name = { 

1208 "cumsum": "cumulative_sum_checked", 

1209 }.get(name, name) 

1210 pyarrow_meth = getattr(pc, pyarrow_name, None) 

1211 if pyarrow_meth is None: 

1212 return super()._accumulate(name, skipna=skipna, **kwargs) 

1213 

1214 data_to_accum = self._data 

1215 

1216 pa_dtype = data_to_accum.type 

1217 if pa.types.is_duration(pa_dtype): 

1218 data_to_accum = data_to_accum.cast(pa.int64()) 

1219 

1220 result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs) 

1221 

1222 if pa.types.is_duration(pa_dtype): 

1223 result = result.cast(pa_dtype) 

1224 

1225 return type(self)(result) 

1226 

1227 def _reduce(self, name: str, *, skipna: bool = True, **kwargs): 

1228 """ 

1229 Return a scalar result of performing the reduction operation. 

1230 

1231 Parameters 

1232 ---------- 

1233 name : str 

1234 Name of the function, supported values are: 

1235 { any, all, min, max, sum, mean, median, prod, 

1236 std, var, sem, kurt, skew }. 

1237 skipna : bool, default True 

1238 If True, skip NaN values. 

1239 **kwargs 

1240 Additional keyword arguments passed to the reduction function. 

1241 Currently, `ddof` is the only supported kwarg. 

1242 

1243 Returns 

1244 ------- 

1245 scalar 

1246 

1247 Raises 

1248 ------ 

1249 TypeError : subclass does not define reductions 

1250 """ 

1251 pa_type = self._data.type 

1252 

1253 data_to_reduce = self._data 

1254 

1255 if name in ["any", "all"] and ( 

1256 pa.types.is_integer(pa_type) 

1257 or pa.types.is_floating(pa_type) 

1258 or pa.types.is_duration(pa_type) 

1259 or pa.types.is_decimal(pa_type) 

1260 ): 

1261 # pyarrow only supports any/all for boolean dtype, we allow 

1262 # for other dtypes, matching our non-pyarrow behavior 

1263 

1264 if pa.types.is_duration(pa_type): 

1265 data_to_cmp = self._data.cast(pa.int64()) 

1266 else: 

1267 data_to_cmp = self._data 

1268 

1269 not_eq = pc.not_equal(data_to_cmp, 0) 

1270 data_to_reduce = not_eq 

1271 

1272 elif name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): 

1273 data_to_reduce = self._data.cast(pa.int64()) 

1274 

1275 elif name in ["median", "mean", "std", "sem"] and pa.types.is_temporal(pa_type): 

1276 nbits = pa_type.bit_width 

1277 if nbits == 32: 

1278 data_to_reduce = self._data.cast(pa.int32()) 

1279 else: 

1280 data_to_reduce = self._data.cast(pa.int64()) 

1281 

1282 if name == "sem": 

1283 

1284 def pyarrow_meth(data, skip_nulls, **kwargs): 

1285 numerator = pc.stddev(data, skip_nulls=skip_nulls, **kwargs) 

1286 denominator = pc.sqrt_checked(pc.count(self._data)) 

1287 return pc.divide_checked(numerator, denominator) 

1288 

1289 else: 

1290 pyarrow_name = { 

1291 "median": "quantile", 

1292 "prod": "product", 

1293 "std": "stddev", 

1294 "var": "variance", 

1295 }.get(name, name) 

1296 # error: Incompatible types in assignment 

1297 # (expression has type "Optional[Any]", variable has type 

1298 # "Callable[[Any, Any, KwArg(Any)], Any]") 

1299 pyarrow_meth = getattr(pc, pyarrow_name, None) # type: ignore[assignment] 

1300 if pyarrow_meth is None: 

1301 # Let ExtensionArray._reduce raise the TypeError 

1302 return super()._reduce(name, skipna=skipna, **kwargs) 

1303 

1304 # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0 

1305 if name in ["any", "all"] and "min_count" not in kwargs: 

1306 kwargs["min_count"] = 0 

1307 elif name == "median": 

1308 # GH 52679: Use quantile instead of approximate_median 

1309 kwargs["q"] = 0.5 

1310 

1311 try: 

1312 result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs) 

1313 except (AttributeError, NotImplementedError, TypeError) as err: 

1314 msg = ( 

1315 f"'{type(self).__name__}' with dtype {self.dtype} " 

1316 f"does not support reduction '{name}' with pyarrow " 

1317 f"version {pa.__version__}. '{name}' may be supported by " 

1318 f"upgrading pyarrow." 

1319 ) 

1320 raise TypeError(msg) from err 

1321 if name == "median": 

1322 # GH 52679: Use quantile instead of approximate_median; returns array 

1323 result = result[0] 

1324 if pc.is_null(result).as_py(): 

1325 return self.dtype.na_value 

1326 

1327 if name in ["min", "max", "sum"] and pa.types.is_duration(pa_type): 

1328 result = result.cast(pa_type) 

1329 if name in ["median", "mean"] and pa.types.is_temporal(pa_type): 

1330 result = result.cast(pa_type) 

1331 if name in ["std", "sem"] and pa.types.is_temporal(pa_type): 

1332 result = result.cast(pa.int64()) 

1333 if pa.types.is_duration(pa_type): 

1334 result = result.cast(pa_type) 

1335 elif pa.types.is_time(pa_type): 

1336 unit = get_unit_from_pa_dtype(pa_type) 

1337 result = result.cast(pa.duration(unit)) 

1338 elif pa.types.is_date(pa_type): 

1339 # go with closest available unit, i.e. "s" 

1340 result = result.cast(pa.duration("s")) 

1341 else: 

1342 # i.e. timestamp 

1343 result = result.cast(pa.duration(pa_type.unit)) 

1344 

1345 return result.as_py() 

1346 
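 # Example sketch: per GH 52679 above, "median" is routed to pc.quantile
 # with q=0.5, so reducing [1, 2, 3, None] with name="median" gives the
 # Python float 2.0, the null being skipped by default (skipna=True).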

1347 def __setitem__(self, key, value) -> None: 

1348 """Set one or more values inplace. 

1349 

1350 Parameters 

1351 ---------- 

1352 key : int, ndarray, or slice 

1353 When called from, e.g. ``Series.__setitem__``, ``key`` will be 

1354 one of 

1355 

1356 * scalar int 

1357 * ndarray of integers. 

1358 * boolean ndarray 

1359 * slice object 

1360 

1361 value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object 

1362 value or values to be set at ``key``. 

1363 

1364 Returns 

1365 ------- 

1366 None 

1367 """ 

1368 # GH50085: unwrap 1D indexers 

1369 if isinstance(key, tuple) and len(key) == 1: 

1370 key = key[0] 

1371 

1372 key = check_array_indexer(self, key) 

1373 value = self._maybe_convert_setitem_value(value) 

1374 

1375 if com.is_null_slice(key): 

1376 # fast path (GH50248) 

1377 data = self._if_else(True, value, self._data) 

1378 

1379 elif is_integer(key): 

1380 # fast path 

1381 key = cast(int, key) 

1382 n = len(self) 

1383 if key < 0: 

1384 key += n 

1385 if not 0 <= key < n: 

1386 raise IndexError( 

1387 f"index {key} is out of bounds for axis 0 with size {n}" 

1388 ) 

1389 if is_list_like(value): 

1390 raise ValueError("Length of indexer and values mismatch") 

1391 elif isinstance(value, pa.Scalar): 

1392 value = value.as_py() 

1393 chunks = [ 

1394 *self._data[:key].chunks, 

1395 pa.array([value], type=self._data.type, from_pandas=True), 

1396 *self._data[key + 1 :].chunks, 

1397 ] 

1398 data = pa.chunked_array(chunks).combine_chunks() 

1399 

1400 elif is_bool_dtype(key): 

1401 key = np.asarray(key, dtype=np.bool_) 

1402 data = self._replace_with_mask(self._data, key, value) 

1403 

1404 elif is_scalar(value) or isinstance(value, pa.Scalar): 

1405 mask = np.zeros(len(self), dtype=np.bool_) 

1406 mask[key] = True 

1407 data = self._if_else(mask, value, self._data) 

1408 

1409 else: 

1410 indices = np.arange(len(self))[key] 

1411 if len(indices) != len(value): 

1412 raise ValueError("Length of indexer and values mismatch") 

1413 if len(indices) == 0: 

1414 return 

1415 argsort = np.argsort(indices) 

1416 indices = indices[argsort] 

1417 value = value.take(argsort) 

1418 mask = np.zeros(len(self), dtype=np.bool_) 

1419 mask[indices] = True 

1420 data = self._replace_with_mask(self._data, mask, value) 

1421 

1422 if isinstance(data, pa.Array): 

1423 data = pa.chunked_array([data]) 

1424 self._data = data 

1425 

1426 def _rank( 

1427 self, 

1428 *, 

1429 axis: AxisInt = 0, 

1430 method: str = "average", 

1431 na_option: str = "keep", 

1432 ascending: bool = True, 

1433 pct: bool = False, 

1434 ): 

1435 """ 

1436 See Series.rank.__doc__. 

1437 """ 

1438 if pa_version_under9p0 or axis != 0: 

1439 ranked = super()._rank( 

1440 axis=axis, 

1441 method=method, 

1442 na_option=na_option, 

1443 ascending=ascending, 

1444 pct=pct, 

1445 ) 

1446 # keep dtypes consistent with the implementation below 

1447 if method == "average" or pct: 

1448 pa_type = pa.float64() 

1449 else: 

1450 pa_type = pa.uint64() 

1451 result = pa.array(ranked, type=pa_type, from_pandas=True) 

1452 return type(self)(result) 

1453 

1454 data = self._data.combine_chunks() 

1455 sort_keys = "ascending" if ascending else "descending" 

1456 null_placement = "at_start" if na_option == "top" else "at_end" 

1457 tiebreaker = "min" if method == "average" else method 

1458 

1459 result = pc.rank( 

1460 data, 

1461 sort_keys=sort_keys, 

1462 null_placement=null_placement, 

1463 tiebreaker=tiebreaker, 

1464 ) 

1465 

1466 if na_option == "keep": 

1467 mask = pc.is_null(self._data) 

1468 null = pa.scalar(None, type=result.type) 

1469 result = pc.if_else(mask, null, result) 

1470 

1471 if method == "average": 

1472 result_max = pc.rank( 

1473 data, 

1474 sort_keys=sort_keys, 

1475 null_placement=null_placement, 

1476 tiebreaker="max", 

1477 ) 

1478 result_max = result_max.cast(pa.float64()) 

1479 result_min = result.cast(pa.float64()) 

1480 result = pc.divide(pc.add(result_min, result_max), 2) 

1481 

1482 if pct: 

1483 if not pa.types.is_floating(result.type): 

1484 result = result.cast(pa.float64()) 

1485 if method == "dense": 

1486 divisor = pc.max(result) 

1487 else: 

1488 divisor = pc.count(result) 

1489 result = pc.divide(result, divisor) 

1490 

1491 return type(self)(result) 

1492 
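 # Example sketch of the "average" emulation above: for [1, 1, 2] the
 # "min" tiebreaker ranks are [1, 1, 3] and the "max" ranks are [2, 2, 3],
 # so their mean gives the familiar average ranks [1.5, 1.5, 3.0].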

1493 def _quantile( 

1494 self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str 

1495 ) -> ArrowExtensionArrayT: 

1496 """ 

1497 Compute the quantiles of self for each quantile in `qs`. 

1498 

1499 Parameters 

1500 ---------- 

1501 qs : np.ndarray[float64] 

1502 interpolation: str 

1503 

1504 Returns 

1505 ------- 

1506 same type as self 

1507 """ 

1508 pa_dtype = self._data.type 

1509 

1510 data = self._data 

1511 if pa.types.is_temporal(pa_dtype): 

1512 # https://github.com/apache/arrow/issues/33769 in these cases 

1513 # we can cast to ints and back 

1514 nbits = pa_dtype.bit_width 

1515 if nbits == 32: 

1516 data = data.cast(pa.int32()) 

1517 else: 

1518 data = data.cast(pa.int64()) 

1519 

1520 result = pc.quantile(data, q=qs, interpolation=interpolation) 

1521 

1522 if pa.types.is_temporal(pa_dtype): 

1523 nbits = pa_dtype.bit_width 

1524 if nbits == 32: 

1525 result = result.cast(pa.int32()) 

1526 else: 

1527 result = result.cast(pa.int64()) 

1528 result = result.cast(pa_dtype) 

1529 

1530 return type(self)(result) 

1531 

1532 def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT: 

1533 """ 

1534 Returns the mode(s) of the ExtensionArray. 

1535 

1536 Always returns `ExtensionArray` even if only one value. 

1537 

1538 Parameters 

1539 ---------- 

1540 dropna : bool, default True 

1541 Don't consider counts of NA values. 

1542 

1543 Returns 

1544 ------- 

1545 same type as self 

1546 Sorted, if possible. 

1547 """ 

1548 pa_type = self._data.type 

1549 if pa.types.is_temporal(pa_type): 

1550 nbits = pa_type.bit_width 

1551 if nbits == 32: 

1552 data = self._data.cast(pa.int32()) 

1553 elif nbits == 64: 

1554 data = self._data.cast(pa.int64()) 

1555 else: 

1556 raise NotImplementedError(pa_type) 

1557 else: 

1558 data = self._data 

1559 

1560 if dropna: 

1561 data = data.drop_null() 

1562 

1563 res = pc.value_counts(data) 

1564 most_common = res.field("values").filter( 

1565 pc.equal(res.field("counts"), pc.max(res.field("counts"))) 

1566 ) 

1567 

1568 if pa.types.is_temporal(pa_type): 

1569 most_common = most_common.cast(pa_type) 

1570 

1571 return type(self)(most_common) 

1572 

1573 def _maybe_convert_setitem_value(self, value): 

1574 """Maybe convert value to be pyarrow compatible.""" 

1575 if value is None: 

1576 return value 

1577 if isinstance(value, (pa.Scalar, pa.Array, pa.ChunkedArray)): 

1578 return value 

1579 if is_list_like(value): 

1580 pa_box = pa.array 

1581 else: 

1582 pa_box = pa.scalar 

1583 try: 

1584 value = pa_box(value, type=self._data.type, from_pandas=True) 

1585 except pa.ArrowTypeError as err: 

1586 msg = f"Invalid value '{str(value)}' for dtype {self.dtype}" 

1587 raise TypeError(msg) from err 

1588 return value 

1589 

1590 @classmethod 

1591 def _if_else( 

1592 cls, 

1593 cond: npt.NDArray[np.bool_] | bool, 

1594 left: ArrayLike | Scalar, 

1595 right: ArrayLike | Scalar, 

1596 ): 

1597 """ 

1598 Choose values based on a condition. 

1599 

1600 Analogous to pyarrow.compute.if_else, with logic 

1601 to fallback to numpy for unsupported types. 

1602 

1603 Parameters 

1604 ---------- 

1605 cond : npt.NDArray[np.bool_] or bool 

1606 left : ArrayLike | Scalar 

1607 right : ArrayLike | Scalar 

1608 

1609 Returns 

1610 ------- 

1611 pa.Array 

1612 """ 

1613 try: 

1614 return pc.if_else(cond, left, right) 

1615 except pa.ArrowNotImplementedError: 

1616 pass 

1617 

1618 def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]: 

1619 if isinstance(value, (pa.Array, pa.ChunkedArray)): 

1620 pa_type = value.type 

1621 elif isinstance(value, pa.Scalar): 

1622 pa_type = value.type 

1623 value = value.as_py() 

1624 else: 

1625 pa_type = None 

1626 return np.array(value, dtype=object), pa_type 

1627 

1628 left, left_type = _to_numpy_and_type(left) 

1629 right, right_type = _to_numpy_and_type(right) 

1630 pa_type = left_type or right_type 

1631 result = np.where(cond, left, right) 

1632 return pa.array(result, type=pa_type, from_pandas=True) 

1633 

1634 @classmethod 

1635 def _replace_with_mask( 

1636 cls, 

1637 values: pa.Array | pa.ChunkedArray, 

1638 mask: npt.NDArray[np.bool_] | bool, 

1639 replacements: ArrayLike | Scalar, 

1640 ): 

1641 """ 

1642 Replace items selected with a mask. 

1643 

1644 Analogous to pyarrow.compute.replace_with_mask, with logic 

1645 to fallback to numpy for unsupported types. 

1646 

1647 Parameters 

1648 ---------- 

1649 values : pa.Array or pa.ChunkedArray 

1650 mask : npt.NDArray[np.bool_] or bool 

1651 replacements : ArrayLike or Scalar 

1652 Replacement value(s) 

1653 

1654 Returns 

1655 ------- 

1656 pa.Array or pa.ChunkedArray 

1657 """ 

1658 if isinstance(replacements, pa.ChunkedArray): 

1659 # replacements must be array or scalar, not ChunkedArray 

1660 replacements = replacements.combine_chunks() 

1661 if pa_version_under8p0: 

1662 # pc.replace_with_mask seems to be a bit unreliable for versions < 8.0: 

1663 # version <= 7: segfaults with various types 

1664 # version <= 6: fails to replace nulls 

1665 if isinstance(replacements, pa.Array): 

1666 indices = np.full(len(values), None) 

1667 indices[mask] = np.arange(len(replacements)) 

1668 indices = pa.array(indices, type=pa.int64()) 

1669 replacements = replacements.take(indices) 

1670 return cls._if_else(mask, replacements, values) 

1671 if isinstance(values, pa.ChunkedArray) and pa.types.is_boolean(values.type): 

1672 # GH#52059 replace_with_mask segfaults for chunked array 

1673 # https://github.com/apache/arrow/issues/34634 

1674 values = values.combine_chunks() 

1675 try: 

1676 return pc.replace_with_mask(values, mask, replacements) 

1677 except pa.ArrowNotImplementedError: 

1678 pass 

1679 if isinstance(replacements, pa.Array): 

1680 replacements = np.array(replacements, dtype=object) 

1681 elif isinstance(replacements, pa.Scalar): 

1682 replacements = replacements.as_py() 

1683 result = np.array(values, dtype=object) 

1684 result[mask] = replacements 

1685 return pa.array(result, type=values.type, from_pandas=True) 

1686 
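 # Example sketch of the happy path above (assumes pyarrow >= 8, where
 # pc.replace_with_mask is reliable):
 #   pc.replace_with_mask(
 #       pa.array([1, 2, 3]), pa.array([True, False, True]), pa.array([10, 30])
 #   )  # -> [10, 2, 30]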

1687 def _apply_elementwise(self, func: Callable) -> list[list[Any]]: 

1688 """Apply a callable to each element while maintaining the chunking structure.""" 

1689 return [ 

1690 [ 

1691 None if val is None else func(val) 

1692 for val in chunk.to_numpy(zero_copy_only=False) 

1693 ] 

1694 for chunk in self._data.iterchunks() 

1695 ] 

1696 

1697 def _str_count(self, pat: str, flags: int = 0): 

1698 if flags: 

1699 raise NotImplementedError(f"count not implemented with {flags=}") 

1700 return type(self)(pc.count_substring_regex(self._data, pat)) 

1701 

1702 def _str_pad( 

1703 self, 

1704 width: int, 

1705 side: Literal["left", "right", "both"] = "left", 

1706 fillchar: str = " ", 

1707 ): 

1708 if side == "left": 

1709 pa_pad = pc.utf8_lpad 

1710 elif side == "right": 

1711 pa_pad = pc.utf8_rpad 

1712 elif side == "both": 

1713 pa_pad = pc.utf8_center 

1714 else: 

1715 raise ValueError( 

1716 f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'" 

1717 ) 

1718 return type(self)(pa_pad(self._data, width=width, padding=fillchar)) 

1719 

1720 def _str_contains( 

1721 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True 

1722 ): 

1723 if flags: 

1724 raise NotImplementedError(f"contains not implemented with {flags=}") 

1725 

1726 if regex: 

1727 pa_contains = pc.match_substring_regex 

1728 else: 

1729 pa_contains = pc.match_substring 

1730 result = pa_contains(self._data, pat, ignore_case=not case) 

1731 if not isna(na): 

1732 result = result.fill_null(na) 

1733 return type(self)(result) 
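
The `na` handling above relies on kernel nulls propagating: a null input stays null through the match, then `fill_null` swaps in the caller's sentinel. A small sketch:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["apple", "banana", None])
result = pc.match_substring_regex(arr, "an+", ignore_case=False)
result.to_pylist()                   # [False, True, None]
result.fill_null(False).to_pylist()  # [False, True, False]  (na=False)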

1734 

1735 def _str_startswith(self, pat: str, na=None): 

1736 result = pc.starts_with(self._data, pattern=pat) 

1737 if not isna(na): 

1738 result = result.fill_null(na) 

1739 return type(self)(result) 

1740 

1741 def _str_endswith(self, pat: str, na=None): 

1742 result = pc.ends_with(self._data, pattern=pat) 

1743 if not isna(na): 

1744 result = result.fill_null(na) 

1745 return type(self)(result) 

1746 

1747 def _str_replace( 

1748 self, 

1749 pat: str | re.Pattern, 

1750 repl: str | Callable, 

1751 n: int = -1, 

1752 case: bool = True, 

1753 flags: int = 0, 

1754 regex: bool = True, 

1755 ): 

1756 if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: 

1757 raise NotImplementedError( 

1758 "replace is not supported with a re.Pattern, callable repl, " 

1759 "case=False, or flags!=0" 

1760 ) 

1761 

1762 func = pc.replace_substring_regex if regex else pc.replace_substring 

1763 result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) 

1764 return type(self)(result) 

1765 

1766 def _str_repeat(self, repeats: int | Sequence[int]): 

1767 if not isinstance(repeats, int): 

1768 raise NotImplementedError( 

1769 f"repeat is not implemented when repeats is {type(repeats).__name__}" 

1770 ) 

1771 elif pa_version_under7p0: 

1772 raise NotImplementedError("repeat is not implemented for pyarrow < 7") 

1773 else: 

1774 return type(self)(pc.binary_repeat(self._data, repeats)) 

1775 

1776 def _str_match( 

1777 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None 

1778 ): 

1779 if not pat.startswith("^"): 

1780 pat = f"^{pat}" 

1781 return self._str_contains(pat, case, flags, na, regex=True) 

1782 

1783 def _str_fullmatch( 

1784 self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None 

1785 ): 

1786 if not pat.endswith("$") or pat.endswith("\\$"):  # a trailing escaped "\$" is a literal, so an anchor is still needed 

1787 pat = f"{pat}$" 

1788 return self._str_match(pat, case, flags, na) 
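
Taken together, `_str_fullmatch` appends `$` (unless the pattern already ends in an unescaped one) and `_str_match` prepends `^`, so the regex can only match whole strings. A hypothetical check:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["abc", "abcd"])
pc.match_substring_regex(arr, "^abc$")  # -> [True, False]: fullmatch semantics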

1789 

1790 def _str_find(self, sub: str, start: int = 0, end: int | None = None): 

1791 if start != 0 and end is not None: 

1792 slices = pc.utf8_slice_codeunits(self._data, start, stop=end) 

1793 result = pc.find_substring(slices, sub) 

1794 not_found = pc.equal(result, -1) 

1795 offset_result = pc.add(result, start)  # slice-relative hit + start = absolute position 

1796 result = pc.if_else(not_found, result, offset_result) 

1797 elif start == 0 and end is None: 

1798 slices = self._data 

1799 result = pc.find_substring(slices, sub) 

1800 else: 

1801 raise NotImplementedError( 

1802 f"find not implemented with {sub=}, {start=}, {end=}" 

1803 ) 

1804 return type(self)(result) 
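
The slice-and-offset trick in miniature, with hypothetical inputs: positions found inside the `[start:end)` slice map back to the original string by adding `start`, while `-1` (not found) passes through unchanged:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["xxabc"])
start, end = 2, 5
slices = pc.utf8_slice_codeunits(arr, start, stop=end)   # ["abc"]
rel = pc.find_substring(slices, "b")                     # [1], slice-relative
pc.if_else(pc.equal(rel, -1), rel, pc.add(rel, start))   # [3], as "xxabc".find("b", 2, 5)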

1805 

1806 def _str_get(self, i: int): 

1807 lengths = pc.utf8_length(self._data) 

1808 if i >= 0: 

1809 out_of_bounds = pc.greater_equal(i, lengths) 

1810 start = i 

1811 stop = i + 1 

1812 step = 1 

1813 else: 

1814 out_of_bounds = pc.greater(-i, lengths) 

1815 start = i 

1816 stop = i - 1 

1817 step = -1 

1818 not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True)) 

1819 selected = pc.utf8_slice_codeunits( 

1820 self._data, start=start, stop=stop, step=step 

1821 ) 

1822 result = pa.array([None] * self._data.length(), type=self._data.type) 

1823 result = pc.if_else(not_out_of_bounds, selected, result) 

1824 return type(self)(result) 
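
The negative-index branch extracts a single code unit by slicing backwards; entries where `|i|` exceeds the length are forced to null, with `fill_null(True)` keeping null strings null. A sketch of the slice itself, assuming a pyarrow version whose `utf8_slice_codeunits` accepts negative steps:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["abc"])
# i = -2: the slice [i : i - 1 : -1] yields exactly one code unit.
pc.utf8_slice_codeunits(arr, start=-2, stop=-3, step=-1)  # ["b"]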

1825 

1826 def _str_join(self, sep: str): 

1827 return type(self)(pc.binary_join(self._data, sep)) 

1828 

1829 def _str_partition(self, sep: str, expand: bool): 

1830 predicate = lambda val: val.partition(sep) 

1831 result = self._apply_elementwise(predicate) 

1832 return type(self)(pa.chunked_array(result)) 

1833 

1834 def _str_rpartition(self, sep: str, expand: bool): 

1835 predicate = lambda val: val.rpartition(sep) 

1836 result = self._apply_elementwise(predicate) 

1837 return type(self)(pa.chunked_array(result)) 

1838 

1839 def _str_slice( 

1840 self, start: int | None = None, stop: int | None = None, step: int | None = None 

1841 ): 

1842 if start is None: 

1843 start = 0 

1844 if step is None: 

1845 step = 1 

1846 return type(self)( 

1847 pc.utf8_slice_codeunits(self._data, start=start, stop=stop, step=step) 

1848 ) 

1849 

1850 def _str_slice_replace( 

1851 self, start: int | None = None, stop: int | None = None, repl: str | None = None 

1852 ): 

1853 if repl is None: 

1854 repl = "" 

1855 if start is None: 

1856 start = 0 

1857 return type(self)(pc.utf8_replace_slice(self._data, start, stop, repl)) 

1858 

1859 def _str_isalnum(self): 

1860 return type(self)(pc.utf8_is_alnum(self._data)) 

1861 

1862 def _str_isalpha(self): 

1863 return type(self)(pc.utf8_is_alpha(self._data)) 

1864 

1865 def _str_isdecimal(self): 

1866 return type(self)(pc.utf8_is_decimal(self._data)) 

1867 

1868 def _str_isdigit(self): 

1869 return type(self)(pc.utf8_is_digit(self._data)) 

1870 

1871 def _str_islower(self): 

1872 return type(self)(pc.utf8_is_lower(self._data)) 

1873 

1874 def _str_isnumeric(self): 

1875 return type(self)(pc.utf8_is_numeric(self._data)) 

1876 

1877 def _str_isspace(self): 

1878 return type(self)(pc.utf8_is_space(self._data)) 

1879 

1880 def _str_istitle(self): 

1881 return type(self)(pc.utf8_is_title(self._data)) 

1882 

1883 def _str_capitalize(self): 

1884 return type(self)(pc.utf8_capitalize(self._data)) 

1885 

1886 def _str_title(self): 

1887 return type(self)(pc.utf8_title(self._data)) 

1888 

1889 def _str_isupper(self): 

1890 return type(self)(pc.utf8_is_upper(self._data)) 

1891 

1892 def _str_swapcase(self): 

1893 return type(self)(pc.utf8_swapcase(self._data)) 

1894 

1895 def _str_len(self): 

1896 return type(self)(pc.utf8_length(self._data)) 

1897 

1898 def _str_lower(self): 

1899 return type(self)(pc.utf8_lower(self._data)) 

1900 

1901 def _str_upper(self): 

1902 return type(self)(pc.utf8_upper(self._data)) 

1903 

1904 def _str_strip(self, to_strip=None): 

1905 if to_strip is None: 

1906 result = pc.utf8_trim_whitespace(self._data) 

1907 else: 

1908 result = pc.utf8_trim(self._data, characters=to_strip) 

1909 return type(self)(result) 

1910 

1911 def _str_lstrip(self, to_strip=None): 

1912 if to_strip is None: 

1913 result = pc.utf8_ltrim_whitespace(self._data) 

1914 else: 

1915 result = pc.utf8_ltrim(self._data, characters=to_strip) 

1916 return type(self)(result) 

1917 

1918 def _str_rstrip(self, to_strip=None): 

1919 if to_strip is None: 

1920 result = pc.utf8_rtrim_whitespace(self._data) 

1921 else: 

1922 result = pc.utf8_rtrim(self._data, characters=to_strip) 

1923 return type(self)(result) 

1924 

1925 def _str_removeprefix(self, prefix: str): 

1926 # TODO: Should work once https://github.com/apache/arrow/issues/14991 is fixed 

1927 # starts_with = pc.starts_with(self._data, pattern=prefix) 

1928 # removed = pc.utf8_slice_codeunits(self._data, len(prefix)) 

1929 # result = pc.if_else(starts_with, removed, self._data) 

1930 # return type(self)(result) 

1931 if sys.version_info < (3, 9): 

1932 # NOTE pyupgrade will remove this when we run it with --py39-plus 

1933 # so don't remove the unnecessary `else` statement below 

1934 from pandas.util._str_methods import removeprefix 

1935 

1936 predicate = functools.partial(removeprefix, prefix=prefix) 

1937 else: 

1938 predicate = lambda val: val.removeprefix(prefix) 

1939 result = self._apply_elementwise(predicate) 

1940 return type(self)(pa.chunked_array(result)) 
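
The backport used on Python < 3.9 behaves like the builtin; a pure-Python equivalent (a sketch, not the actual pandas.util._str_methods implementation):

def removeprefix(text: str, prefix: str) -> str:
    # str.removeprefix semantics: strip the prefix only when present.
    if text.startswith(prefix):
        return text[len(prefix):]
    return text

removeprefix("abcd", "ab")  # "cd"
removeprefix("xyz", "ab")   # "xyz" (unchanged)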

1941 

1942 def _str_removesuffix(self, suffix: str): 

1943 ends_with = pc.ends_with(self._data, pattern=suffix) 

1944 removed = pc.utf8_slice_codeunits(self._data, 0, stop=-len(suffix)) 

1945 result = pc.if_else(ends_with, removed, self._data) 

1946 return type(self)(result) 

1947 

1948 def _str_casefold(self): 

1949 predicate = lambda val: val.casefold() 

1950 result = self._apply_elementwise(predicate) 

1951 return type(self)(pa.chunked_array(result)) 

1952 

1953 def _str_encode(self, encoding: str, errors: str = "strict"): 

1954 predicate = lambda val: val.encode(encoding, errors) 

1955 result = self._apply_elementwise(predicate) 

1956 return type(self)(pa.chunked_array(result)) 

1957 

1958 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): 

1959 raise NotImplementedError( 

1960 "str.extract not supported with pd.ArrowDtype(pa.string())." 

1961 ) 

1962 

1963 def _str_findall(self, pat: str, flags: int = 0): 

1964 regex = re.compile(pat, flags=flags) 

1965 predicate = lambda val: regex.findall(val) 

1966 result = self._apply_elementwise(predicate) 

1967 return type(self)(pa.chunked_array(result)) 

1968 

1969 def _str_get_dummies(self, sep: str = "|"): 

1970 split = pc.split_pattern(self._data, sep).combine_chunks() 

1971 uniques = split.flatten().unique() 

1972 uniques_sorted = uniques.take(pc.array_sort_indices(uniques)) 

1973 result_data = [] 

1974 for lst in split.to_pylist(): 

1975 if lst is None: 

1976 result_data.append([False] * len(uniques_sorted)) 

1977 else: 

1978 res = pc.is_in(uniques_sorted, pa.array(set(lst))) 

1979 result_data.append(res.to_pylist()) 

1980 result = type(self)(pa.array(result_data)) 

1981 return result, uniques_sorted.to_pylist() 
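
Step by step, on hypothetical input: split each row on the separator, collect and sort the distinct tokens, then turn every row into a boolean membership vector over those tokens (null rows become all-False):

import pyarrow as pa
import pyarrow.compute as pc

split = pc.split_pattern(pa.chunked_array([["a|b", "b", None]]), "|").combine_chunks()
uniques = split.flatten().unique()
uniques_sorted = uniques.take(pc.array_sort_indices(uniques))  # ["a", "b"]
# rows: "a|b" -> [True, True], "b" -> [False, True], None -> [False, False]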

1982 

1983 def _str_index(self, sub: str, start: int = 0, end: int | None = None): 

1984 predicate = lambda val: val.index(sub, start, end) 

1985 result = self._apply_elementwise(predicate) 

1986 return type(self)(pa.chunked_array(result)) 

1987 

1988 def _str_rindex(self, sub: str, start: int = 0, end: int | None = None): 

1989 predicate = lambda val: val.rindex(sub, start, end) 

1990 result = self._apply_elementwise(predicate) 

1991 return type(self)(pa.chunked_array(result)) 

1992 

1993 def _str_normalize(self, form: str): 

1994 predicate = lambda val: unicodedata.normalize(form, val) 

1995 result = self._apply_elementwise(predicate) 

1996 return type(self)(pa.chunked_array(result)) 

1997 

1998 def _str_rfind(self, sub: str, start: int = 0, end=None): 

1999 predicate = lambda val: val.rfind(sub, start, end) 

2000 result = self._apply_elementwise(predicate) 

2001 return type(self)(pa.chunked_array(result)) 

2002 

2003 def _str_split( 

2004 self, 

2005 pat: str | None = None, 

2006 n: int | None = -1, 

2007 expand: bool = False, 

2008 regex: bool | None = None, 

2009 ): 

2010 if n in {-1, 0}: 

2011 n = None 

2012 if regex: 

2013 split_func = pc.split_pattern_regex 

2014 else: 

2015 split_func = pc.split_pattern 

2016 return type(self)(split_func(self._data, pat, max_splits=n)) 
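
pandas uses `n in {-1, 0}` to mean "no limit", which pyarrow spells as `max_splits=None`; for positive `n` the kernels agree with `str.split`'s count:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array(["a,b,c"])
pc.split_pattern(arr, ",", max_splits=None)  # [["a", "b", "c"]]
pc.split_pattern(arr, ",", max_splits=1)     # [["a", "b,c"]]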

2017 

2018 def _str_rsplit(self, pat: str | None = None, n: int | None = -1): 

2019 if n in {-1, 0}: 

2020 n = None 

2021 return type(self)(pc.split_pattern(self._data, pat, max_splits=n, reverse=True)) 

2022 

2023 def _str_translate(self, table: dict[int, str]): 

2024 predicate = lambda val: val.translate(table) 

2025 result = self._apply_elementwise(predicate) 

2026 return type(self)(pa.chunked_array(result)) 

2027 

2028 def _str_wrap(self, width: int, **kwargs): 

2029 kwargs["width"] = width 

2030 tw = textwrap.TextWrapper(**kwargs) 

2031 predicate = lambda val: "\n".join(tw.wrap(val)) 

2032 result = self._apply_elementwise(predicate) 

2033 return type(self)(pa.chunked_array(result)) 
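
There is no wrapping kernel in pyarrow, so this goes element by element through textwrap; the per-value predicate amounts to:

import textwrap

tw = textwrap.TextWrapper(width=5)
"\n".join(tw.wrap("abc def ghi"))  # -> "abc\ndef\nghi"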

2034 

2035 @property 

2036 def _dt_year(self): 

2037 return type(self)(pc.year(self._data)) 

2038 

2039 @property 

2040 def _dt_day(self): 

2041 return type(self)(pc.day(self._data)) 

2042 

2043 @property 

2044 def _dt_day_of_week(self): 

2045 return type(self)(pc.day_of_week(self._data)) 

2046 

2047 _dt_dayofweek = _dt_day_of_week 

2048 _dt_weekday = _dt_day_of_week 

2049 

2050 @property 

2051 def _dt_day_of_year(self): 

2052 return type(self)(pc.day_of_year(self._data)) 

2053 

2054 _dt_dayofyear = _dt_day_of_year 

2055 

2056 @property 

2057 def _dt_hour(self): 

2058 return type(self)(pc.hour(self._data)) 

2059 

2060 def _dt_isocalendar(self): 

2061 return type(self)(pc.iso_calendar(self._data)) 

2062 

2063 @property 

2064 def _dt_is_leap_year(self): 

2065 return type(self)(pc.is_leap_year(self._data)) 

2066 

2067 @property 

2068 def _dt_microsecond(self): 

2069 return type(self)(pc.microsecond(self._data)) 

2070 

2071 @property 

2072 def _dt_minute(self): 

2073 return type(self)(pc.minute(self._data)) 

2074 

2075 @property 

2076 def _dt_month(self): 

2077 return type(self)(pc.month(self._data)) 

2078 

2079 @property 

2080 def _dt_nanosecond(self): 

2081 return type(self)(pc.nanosecond(self._data)) 

2082 

2083 @property 

2084 def _dt_quarter(self): 

2085 return type(self)(pc.quarter(self._data)) 

2086 

2087 @property 

2088 def _dt_second(self): 

2089 return type(self)(pc.second(self._data)) 

2090 

2091 @property 

2092 def _dt_date(self): 

2093 return type(self)(self._data.cast(pa.date32())) 

2094 

2095 @property 

2096 def _dt_time(self): 

2097 unit = ( 

2098 self.dtype.pyarrow_dtype.unit 

2099 if self.dtype.pyarrow_dtype.unit in {"us", "ns"} 

2100 else "ns" 

2101 ) 

2102 return type(self)(self._data.cast(pa.time64(unit))) 
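
`pa.time64` only exists for "us" and "ns", so second/millisecond timestamps are promoted to "ns" before the cast. An illustrative sketch, keeping the unit aligned:

import pyarrow as pa

ts = pa.array([3_661_000_000], type=pa.timestamp("us"))  # 01:01:01 after epoch midnight
ts.cast(pa.time64("us"))  # time-of-day component as time64[us]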

2103 

2104 @property 

2105 def _dt_tz(self): 

2106 return self.dtype.pyarrow_dtype.tz 

2107 

2108 def _dt_strftime(self, format: str): 

2109 return type(self)(pc.strftime(self._data, format=format)) 

2110 

2111 def _round_temporally( 

2112 self, 

2113 method: Literal["ceil", "floor", "round"], 

2114 freq, 

2115 ambiguous: TimeAmbiguous = "raise", 

2116 nonexistent: TimeNonexistent = "raise", 

2117 ): 

2118 if ambiguous != "raise": 

2119 raise NotImplementedError("ambiguous is not supported.") 

2120 if nonexistent != "raise": 

2121 raise NotImplementedError("nonexistent is not supported.") 

2122 offset = to_offset(freq) 

2123 if offset is None: 

2124 raise ValueError(f"Must specify a valid frequency: {freq}") 

2125 pa_supported_unit = { 

2126 "A": "year", 

2127 "AS": "year", 

2128 "Q": "quarter", 

2129 "QS": "quarter", 

2130 "M": "month", 

2131 "MS": "month", 

2132 "W": "week", 

2133 "D": "day", 

2134 "H": "hour", 

2135 "T": "minute", 

2136 "S": "second", 

2137 "L": "millisecond", 

2138 "U": "microsecond", 

2139 "N": "nanosecond", 

2140 } 

2141 unit = pa_supported_unit.get(offset._prefix, None) 

2142 if unit is None: 

2143 raise ValueError(f"{freq=} is not supported") 

2144 multiple = offset.n 

2145 rounding_method = getattr(pc, f"{method}_temporal") 

2146 return type(self)(rounding_method(self._data, multiple=multiple, unit=unit)) 
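
End to end, with a hypothetical frequency: `to_offset` parses the string, the (private) `_prefix` and `n` attributes feed the table above, and the matching pyarrow kernel does the rounding:

import pyarrow as pa
import pyarrow.compute as pc
from pandas.tseries.frequencies import to_offset

offset = to_offset("15T")  # Minute(15): offset._prefix == "T", offset.n == 15
arr = pa.array([59 * 60], type=pa.timestamp("s"))  # 00:59:00
pc.floor_temporal(arr, multiple=offset.n, unit="minute")  # -> 00:45:00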

2147 

2148 def _dt_ceil( 

2149 self, 

2150 freq, 

2151 ambiguous: TimeAmbiguous = "raise", 

2152 nonexistent: TimeNonexistent = "raise", 

2153 ): 

2154 return self._round_temporally("ceil", freq, ambiguous, nonexistent) 

2155 

2156 def _dt_floor( 

2157 self, 

2158 freq, 

2159 ambiguous: TimeAmbiguous = "raise", 

2160 nonexistent: TimeNonexistent = "raise", 

2161 ): 

2162 return self._round_temporally("floor", freq, ambiguous, nonexistent) 

2163 

2164 def _dt_round( 

2165 self, 

2166 freq, 

2167 ambiguous: TimeAmbiguous = "raise", 

2168 nonexistent: TimeNonexistent = "raise", 

2169 ): 

2170 return self._round_temporally("round", freq, ambiguous, nonexistent) 

2171 

2172 def _dt_to_pydatetime(self): 

2173 if pa.types.is_date(self.dtype.pyarrow_dtype): 

2174 raise ValueError( 

2175 f"to_pydatetime cannot be called with {self.dtype.pyarrow_dtype} type. " 

2176 "Convert to pyarrow timestamp type." 

2177 ) 

2178 data = self._data.to_pylist() 

2179 if self._dtype.pyarrow_dtype.unit == "ns": 

2180 data = [None if ts is None else ts.to_pydatetime(warn=False) for ts in data] 

2181 return np.array(data, dtype=object) 

2182 

2183 def _dt_tz_localize( 

2184 self, 

2185 tz, 

2186 ambiguous: TimeAmbiguous = "raise", 

2187 nonexistent: TimeNonexistent = "raise", 

2188 ): 

2189 if ambiguous != "raise": 

2190 raise NotImplementedError(f"{ambiguous=} is not supported") 

2191 nonexistent_pa = { 

2192 "raise": "raise", 

2193 "shift_backward": "earliest", 

2194 "shift_forward": "latest", 

2195 }.get( 

2196 nonexistent, None # type: ignore[arg-type] 

2197 ) 

2198 if nonexistent_pa is None: 

2199 raise NotImplementedError(f"{nonexistent=} is not supported") 

2200 if tz is None: 

2201 result = self._data.cast(pa.timestamp(self.dtype.pyarrow_dtype.unit)) 

2202 else: 

2203 result = pc.assume_timezone( 

2204 self._data, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa 

2205 ) 

2206 return type(self)(result)
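
For instance (a sketch): naive timestamps get a zone attached via `pc.assume_timezone`, with pandas' "shift_backward"/"shift_forward" options translated to pyarrow's "earliest"/"latest" beforehand; `tz=None` instead strips the zone by casting to a tz-naive timestamp type.

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([0], type=pa.timestamp("us"))  # tz-naive
pc.assume_timezone(arr, "US/Eastern", ambiguous="raise", nonexistent="raise")
# -> timestamp[us, tz=US/Eastern]; the wall-clock values are unchanged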