1from __future__ import annotations
2
3import functools
4import re
5import textwrap
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Literal,
10 cast,
11)
12import unicodedata
13
14import numpy as np
15
16from pandas._libs import lib
17import pandas._libs.missing as libmissing
18import pandas._libs.ops as libops
19
20from pandas.core.dtypes.missing import isna
21
22from pandas.core.strings.base import BaseStringArrayMethods
23
24if TYPE_CHECKING:
25 from collections.abc import Sequence
26
27 from pandas._typing import (
28 NpDtype,
29 Scalar,
30 )
31
32 from pandas import Series
33
34
35class ObjectStringArrayMixin(BaseStringArrayMethods):
36 """
37 String Methods operating on object-dtype ndarrays.
38 """
39
40 _str_na_value = np.nan
41
42 def __len__(self) -> int:
43 # For typing, _str_map relies on the object being sized.
44 raise NotImplementedError
45
46 def _str_map(
47 self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
48 ):
49 """
50 Map a callable over valid elements of the array.
51
52 Parameters
53 ----------
54 f : Callable
55 A function to call on each non-NA element.
56 na_value : Scalar, optional
57 The value to set for NA values. Might also be used for the
58 fill value if the callable `f` raises an exception.
59 This defaults to ``self._str_na_value`` which is ``np.nan``
60 for object-dtype and Categorical and ``pd.NA`` for StringArray.
61 dtype : Dtype, optional
62 The dtype of the result array.
63 convert : bool, default True
64 Whether to call `maybe_convert_objects` on the resulting ndarray
65 """
66 if dtype is None:
67 dtype = np.dtype("object")
68 if na_value is None:
69 na_value = self._str_na_value
70
71 if not len(self):
72 return np.array([], dtype=dtype)
73
74 arr = np.asarray(self, dtype=object)
75 mask = isna(arr)
76 map_convert = convert and not np.all(mask)
77 try:
78 result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
79 except (TypeError, AttributeError) as err:
80 # Reraise the exception if callable `f` got wrong number of args.
81 # The user may want to be warned by this, instead of getting NaN
82 p_err = (
83 r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
84 r"(?(3)required )positional arguments?"
85 )
86
87 if len(err.args) >= 1 and re.search(p_err, err.args[0]):
88 # FIXME: this should be totally avoidable
89 raise err
90
91 def g(x):
92 # This type of fallback behavior can be removed once
93 # we remove object-dtype .str accessor.
94 try:
95 return f(x)
96 except (TypeError, AttributeError):
97 return na_value
98
99 return self._str_map(g, na_value=na_value, dtype=dtype)
100 if not isinstance(result, np.ndarray):
101 return result
102 if na_value is not np.nan:
103 np.putmask(result, mask, na_value)
104 if convert and result.dtype == object:
105 result = lib.maybe_convert_objects(result)
106 return result
107
108 def _str_count(self, pat, flags: int = 0):
109 regex = re.compile(pat, flags=flags)
110 f = lambda x: len(regex.findall(x))
111 return self._str_map(f, dtype="int64")
112
113 def _str_pad(
114 self,
115 width: int,
116 side: Literal["left", "right", "both"] = "left",
117 fillchar: str = " ",
118 ):
119 if side == "left":
120 f = lambda x: x.rjust(width, fillchar)
121 elif side == "right":
122 f = lambda x: x.ljust(width, fillchar)
123 elif side == "both":
124 f = lambda x: x.center(width, fillchar)
125 else: # pragma: no cover
126 raise ValueError("Invalid side")
127 return self._str_map(f)
128
129 def _str_contains(
130 self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
131 ):
132 if regex:
133 if not case:
134 flags |= re.IGNORECASE
135
136 pat = re.compile(pat, flags=flags)
137
138 f = lambda x: pat.search(x) is not None
139 else:
140 if case:
141 f = lambda x: pat in x
142 else:
143 upper_pat = pat.upper()
144 f = lambda x: upper_pat in x.upper()
145 return self._str_map(f, na, dtype=np.dtype("bool"))
146
147 def _str_startswith(self, pat, na=None):
148 f = lambda x: x.startswith(pat)
149 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
150
151 def _str_endswith(self, pat, na=None):
152 f = lambda x: x.endswith(pat)
153 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
154
155 def _str_replace(
156 self,
157 pat: str | re.Pattern,
158 repl: str | Callable,
159 n: int = -1,
160 case: bool = True,
161 flags: int = 0,
162 regex: bool = True,
163 ):
164 if case is False:
165 # add case flag, if provided
166 flags |= re.IGNORECASE
167
168 if regex or flags or callable(repl):
169 if not isinstance(pat, re.Pattern):
170 if regex is False:
171 pat = re.escape(pat)
172 pat = re.compile(pat, flags=flags)
173
174 n = n if n >= 0 else 0
175 f = lambda x: pat.sub(repl=repl, string=x, count=n)
176 else:
177 f = lambda x: x.replace(pat, repl, n)
178
179 return self._str_map(f, dtype=str)
180
181 def _str_repeat(self, repeats: int | Sequence[int]):
182 if lib.is_integer(repeats):
183 rint = cast(int, repeats)
184
185 def scalar_rep(x):
186 try:
187 return bytes.__mul__(x, rint)
188 except TypeError:
189 return str.__mul__(x, rint)
190
191 return self._str_map(scalar_rep, dtype=str)
192 else:
193 from pandas.core.arrays.string_ import BaseStringArray
194
195 def rep(x, r):
196 if x is libmissing.NA:
197 return x
198 try:
199 return bytes.__mul__(x, r)
200 except TypeError:
201 return str.__mul__(x, r)
202
203 result = libops.vec_binop(
204 np.asarray(self),
205 np.asarray(repeats, dtype=object),
206 rep,
207 )
208 if isinstance(self, BaseStringArray):
209 # Not going through map, so we have to do this here.
210 result = type(self)._from_sequence(result, dtype=self.dtype)
211 return result
212
213 def _str_match(
214 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
215 ):
216 if not case:
217 flags |= re.IGNORECASE
218
219 regex = re.compile(pat, flags=flags)
220
221 f = lambda x: regex.match(x) is not None
222 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
223
224 def _str_fullmatch(
225 self,
226 pat: str | re.Pattern,
227 case: bool = True,
228 flags: int = 0,
229 na: Scalar | None = None,
230 ):
231 if not case:
232 flags |= re.IGNORECASE
233
234 regex = re.compile(pat, flags=flags)
235
236 f = lambda x: regex.fullmatch(x) is not None
237 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
238
239 def _str_encode(self, encoding, errors: str = "strict"):
240 f = lambda x: x.encode(encoding, errors=errors)
241 return self._str_map(f, dtype=object)
242
243 def _str_find(self, sub, start: int = 0, end=None):
244 return self._str_find_(sub, start, end, side="left")
245
246 def _str_rfind(self, sub, start: int = 0, end=None):
247 return self._str_find_(sub, start, end, side="right")
248
249 def _str_find_(self, sub, start, end, side):
250 if side == "left":
251 method = "find"
252 elif side == "right":
253 method = "rfind"
254 else: # pragma: no cover
255 raise ValueError("Invalid side")
256
257 if end is None:
258 f = lambda x: getattr(x, method)(sub, start)
259 else:
260 f = lambda x: getattr(x, method)(sub, start, end)
261 return self._str_map(f, dtype="int64")
262
263 def _str_findall(self, pat, flags: int = 0):
264 regex = re.compile(pat, flags=flags)
265 return self._str_map(regex.findall, dtype="object")
266
267 def _str_get(self, i):
268 def f(x):
269 if isinstance(x, dict):
270 return x.get(i)
271 elif len(x) > i >= -len(x):
272 return x[i]
273 return self._str_na_value
274
275 return self._str_map(f)
276
277 def _str_index(self, sub, start: int = 0, end=None):
278 if end:
279 f = lambda x: x.index(sub, start, end)
280 else:
281 f = lambda x: x.index(sub, start, end)
282 return self._str_map(f, dtype="int64")
283
284 def _str_rindex(self, sub, start: int = 0, end=None):
285 if end:
286 f = lambda x: x.rindex(sub, start, end)
287 else:
288 f = lambda x: x.rindex(sub, start, end)
289 return self._str_map(f, dtype="int64")
290
291 def _str_join(self, sep: str):
292 return self._str_map(sep.join)
293
294 def _str_partition(self, sep: str, expand):
295 result = self._str_map(lambda x: x.partition(sep), dtype="object")
296 return result
297
298 def _str_rpartition(self, sep: str, expand):
299 return self._str_map(lambda x: x.rpartition(sep), dtype="object")
300
301 def _str_len(self):
302 return self._str_map(len, dtype="int64")
303
304 def _str_slice(self, start=None, stop=None, step=None):
305 obj = slice(start, stop, step)
306 return self._str_map(lambda x: x[obj])
307
308 def _str_slice_replace(self, start=None, stop=None, repl=None):
309 if repl is None:
310 repl = ""
311
312 def f(x):
313 if x[start:stop] == "":
314 local_stop = start
315 else:
316 local_stop = stop
317 y = ""
318 if start is not None:
319 y += x[:start]
320 y += repl
321 if stop is not None:
322 y += x[local_stop:]
323 return y
324
325 return self._str_map(f)
326
327 def _str_split(
328 self,
329 pat: str | re.Pattern | None = None,
330 n=-1,
331 expand: bool = False,
332 regex: bool | None = None,
333 ):
334 if pat is None:
335 if n is None or n == 0:
336 n = -1
337 f = lambda x: x.split(pat, n)
338 else:
339 new_pat: str | re.Pattern
340 if regex is True or isinstance(pat, re.Pattern):
341 new_pat = re.compile(pat)
342 elif regex is False:
343 new_pat = pat
344 # regex is None so link to old behavior #43563
345 else:
346 if len(pat) == 1:
347 new_pat = pat
348 else:
349 new_pat = re.compile(pat)
350
351 if isinstance(new_pat, re.Pattern):
352 if n is None or n == -1:
353 n = 0
354 f = lambda x: new_pat.split(x, maxsplit=n)
355 else:
356 if n is None or n == 0:
357 n = -1
358 f = lambda x: x.split(pat, n)
359 return self._str_map(f, dtype=object)
360
361 def _str_rsplit(self, pat=None, n=-1):
362 if n is None or n == 0:
363 n = -1
364 f = lambda x: x.rsplit(pat, n)
365 return self._str_map(f, dtype="object")
366
367 def _str_translate(self, table):
368 return self._str_map(lambda x: x.translate(table))
369
370 def _str_wrap(self, width: int, **kwargs):
371 kwargs["width"] = width
372 tw = textwrap.TextWrapper(**kwargs)
373 return self._str_map(lambda s: "\n".join(tw.wrap(s)))
374
375 def _str_get_dummies(self, sep: str = "|"):
376 from pandas import Series
377
378 arr = Series(self).fillna("")
379 try:
380 arr = sep + arr + sep
381 except (TypeError, NotImplementedError):
382 arr = sep + arr.astype(str) + sep
383
384 tags: set[str] = set()
385 for ts in Series(arr, copy=False).str.split(sep):
386 tags.update(ts)
387 tags2 = sorted(tags - {""})
388
389 dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
390
391 def _isin(test_elements: str, element: str) -> bool:
392 return element in test_elements
393
394 for i, t in enumerate(tags2):
395 pat = sep + t + sep
396 dummies[:, i] = lib.map_infer(
397 arr.to_numpy(), functools.partial(_isin, element=pat)
398 )
399 return dummies, tags2
400
401 def _str_upper(self):
402 return self._str_map(lambda x: x.upper())
403
404 def _str_isalnum(self):
405 return self._str_map(str.isalnum, dtype="bool")
406
407 def _str_isalpha(self):
408 return self._str_map(str.isalpha, dtype="bool")
409
410 def _str_isdecimal(self):
411 return self._str_map(str.isdecimal, dtype="bool")
412
413 def _str_isdigit(self):
414 return self._str_map(str.isdigit, dtype="bool")
415
416 def _str_islower(self):
417 return self._str_map(str.islower, dtype="bool")
418
419 def _str_isnumeric(self):
420 return self._str_map(str.isnumeric, dtype="bool")
421
422 def _str_isspace(self):
423 return self._str_map(str.isspace, dtype="bool")
424
425 def _str_istitle(self):
426 return self._str_map(str.istitle, dtype="bool")
427
428 def _str_isupper(self):
429 return self._str_map(str.isupper, dtype="bool")
430
431 def _str_capitalize(self):
432 return self._str_map(str.capitalize)
433
434 def _str_casefold(self):
435 return self._str_map(str.casefold)
436
437 def _str_title(self):
438 return self._str_map(str.title)
439
440 def _str_swapcase(self):
441 return self._str_map(str.swapcase)
442
443 def _str_lower(self):
444 return self._str_map(str.lower)
445
446 def _str_normalize(self, form):
447 f = lambda x: unicodedata.normalize(form, x)
448 return self._str_map(f)
449
450 def _str_strip(self, to_strip=None):
451 return self._str_map(lambda x: x.strip(to_strip))
452
453 def _str_lstrip(self, to_strip=None):
454 return self._str_map(lambda x: x.lstrip(to_strip))
455
456 def _str_rstrip(self, to_strip=None):
457 return self._str_map(lambda x: x.rstrip(to_strip))
458
459 def _str_removeprefix(self, prefix: str) -> Series:
460 # outstanding question on whether to use native methods for users on Python 3.9+
461 # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
462 # in which case we could do return self._str_map(str.removeprefix)
463
464 def removeprefix(text: str) -> str:
465 if text.startswith(prefix):
466 return text[len(prefix) :]
467 return text
468
469 return self._str_map(removeprefix)
470
471 def _str_removesuffix(self, suffix: str) -> Series:
472 return self._str_map(lambda x: x.removesuffix(suffix))
473
474 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
475 regex = re.compile(pat, flags=flags)
476 na_value = self._str_na_value
477
478 if not expand:
479
480 def g(x):
481 m = regex.search(x)
482 return m.groups()[0] if m else na_value
483
484 return self._str_map(g, convert=False)
485
486 empty_row = [na_value] * regex.groups
487
488 def f(x):
489 if not isinstance(x, str):
490 return empty_row
491 m = regex.search(x)
492 if m:
493 return [na_value if item is None else item for item in m.groups()]
494 else:
495 return empty_row
496
497 return [f(val) for val in np.asarray(self)]