1from __future__ import annotations
2
3from typing import Literal
4
5import numpy as np
6
7from pandas.compat import pa_version_under10p1
8
9if not pa_version_under10p1:
10 import pyarrow as pa
11 import pyarrow.compute as pc
12
13
class ArrowStringArrayMixin:
    """
    Mixin implementing string accessor methods with ``pyarrow.compute`` kernels.

    Every method reads the pyarrow data from ``self._pa_array`` and wraps the
    kernel output in a new ``type(self)`` instance. The mixin's own
    ``__init__`` raises ``NotImplementedError``, so these methods are only
    usable through a subclass whose constructor accepts that pyarrow result.
    """

    # Placeholder value; the methods below read the pyarrow data from here.
    _pa_array = None

    def __init__(self, *args, **kwargs) -> None:
        """Always raise; the mixin itself cannot be constructed."""
        raise NotImplementedError

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        """
        Pad each string to ``width`` using ``fillchar``.

        Parameters
        ----------
        width : int
            Passed to the pyarrow kernel as ``width``.
        side : {"left", "right", "both"}, default "left"
            Where the padding goes: ``"left"`` uses ``pc.utf8_lpad``,
            ``"right"`` uses ``pc.utf8_rpad`` and ``"both"`` uses
            ``pc.utf8_center``.
        fillchar : str, default " "
            Passed to the pyarrow kernel as ``padding``.

        Returns
        -------
        type(self)
            New instance wrapping the padded data.

        Raises
        ------
        ValueError
            If ``side`` is not one of the three accepted values.
        """
        if side == "left":
            pa_pad = pc.utf8_lpad
        elif side == "right":
            pa_pad = pc.utf8_rpad
        elif side == "both":
            pa_pad = pc.utf8_center
        else:
            raise ValueError(
                f"Invalid side: {side}. Side must be one of 'left', 'right', 'both'"
            )
        return type(self)(pa_pad(self._pa_array, width=width, padding=fillchar))

    def _str_get(self, i: int):
        """
        Extract the element at position ``i`` from each string.

        Positions are counted the way ``pc.utf8_length`` and
        ``pc.utf8_slice_codeunits`` count them. Negative ``i`` counts from
        the end of the string.

        Parameters
        ----------
        i : int
            Position to extract.

        Returns
        -------
        type(self)
            New instance holding the one-element slices; entries are null
            where ``i`` is out of range for that string or where the bounds
            check itself is null.
        """
        lengths = pc.utf8_length(self._pa_array)
        if i >= 0:
            out_of_bounds = pc.greater_equal(i, lengths)
            start = i
            stop = i + 1
            step = 1
        else:
            out_of_bounds = pc.greater(-i, lengths)
            # Walk backwards for negative positions: a forward slice ending at
            # ``i + 1`` would have ``stop == 0`` when ``i == -1``.
            start = i
            stop = i - 1
            step = -1
        # A null comparison result is treated as out of bounds so that the
        # corresponding output entry becomes null.
        not_out_of_bounds = pc.invert(out_of_bounds.fill_null(True))
        selected = pc.utf8_slice_codeunits(
            self._pa_array, start=start, stop=stop, step=step
        )
        null_value = pa.scalar(
            None, type=self._pa_array.type  # type: ignore[attr-defined]
        )
        result = pc.if_else(not_out_of_bounds, selected, null_value)
        return type(self)(result)

    def _str_slice_replace(
        self, start: int | None = None, stop: int | None = None, repl: str | None = None
    ):
        """
        Replace the slice ``[start:stop]`` of each string with ``repl``.

        Parameters
        ----------
        start : int or None, default None
            Slice start; ``None`` is treated as ``0``.
        stop : int or None, default None
            Slice stop; ``None`` is replaced by the largest ``int64`` value so
            the slice runs to the end of every string.
        repl : str or None, default None
            Replacement text; ``None`` is treated as the empty string.

        Returns
        -------
        type(self)
            New instance wrapping the output of ``pc.utf8_replace_slice``.
        """
        if repl is None:
            repl = ""
        if start is None:
            start = 0
        if stop is None:
            stop = np.iinfo(np.int64).max
        return type(self)(pc.utf8_replace_slice(self._pa_array, start, stop, repl))

    def _str_capitalize(self):
        """Return a new instance with ``pc.utf8_capitalize`` applied."""
        return type(self)(pc.utf8_capitalize(self._pa_array))

    def _str_title(self):
        """Return a new instance with ``pc.utf8_title`` applied."""
        return type(self)(pc.utf8_title(self._pa_array))

    def _str_swapcase(self):
        """Return a new instance with ``pc.utf8_swapcase`` applied."""
        return type(self)(pc.utf8_swapcase(self._pa_array))

    def _str_removesuffix(self, suffix: str):
        """
        Strip ``suffix`` from the end of each string that ends with it.

        Strings that do not end with ``suffix`` are kept as they are. An
        empty ``suffix`` leaves every string unchanged, matching
        ``str.removesuffix("")``.

        Parameters
        ----------
        suffix : str
            Text to remove from the end of matching strings.

        Returns
        -------
        type(self)
            New instance wrapping the stripped data.
        """
        if suffix == "":
            # ``-len("")`` is 0, so the slice below would be ``[0:0]`` and
            # would blank any element the kernel reports as matching instead
            # of leaving it untouched. Nothing needs removing, so skip it.
            return type(self)(self._pa_array)
        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
        result = pc.if_else(ends_with, removed, self._pa_array)
        return type(self)(result)