1from __future__ import annotations
2
3import functools
4import re
5import sys
6import textwrap
7from typing import (
8 TYPE_CHECKING,
9 Callable,
10 Literal,
11)
12import unicodedata
13
14import numpy as np
15
16from pandas._libs import lib
17import pandas._libs.missing as libmissing
18import pandas._libs.ops as libops
19from pandas._typing import (
20 NpDtype,
21 Scalar,
22)
23
24from pandas.core.dtypes.common import is_scalar
25from pandas.core.dtypes.missing import isna
26
27from pandas.core.strings.base import BaseStringArrayMethods
28
29if TYPE_CHECKING:
30 from pandas import Series
31
32
33class ObjectStringArrayMixin(BaseStringArrayMethods):
34 """
35 String Methods operating on object-dtype ndarrays.
36 """
37
38 _str_na_value = np.nan
39
    def __len__(self) -> int:
        """
        Placeholder declaring that the mixin is sized.

        ``_str_map`` calls ``len(self)``; this implementation always raises
        ``NotImplementedError``.
        """
        # For typing, _str_map relies on the object being sized.
        raise NotImplementedError
43
44 def _str_map(
45 self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
46 ):
47 """
48 Map a callable over valid elements of the array.
49
50 Parameters
51 ----------
52 f : Callable
53 A function to call on each non-NA element.
54 na_value : Scalar, optional
55 The value to set for NA values. Might also be used for the
56 fill value if the callable `f` raises an exception.
57 This defaults to ``self._str_na_value`` which is ``np.nan``
58 for object-dtype and Categorical and ``pd.NA`` for StringArray.
59 dtype : Dtype, optional
60 The dtype of the result array.
61 convert : bool, default True
62 Whether to call `maybe_convert_objects` on the resulting ndarray
63 """
64 if dtype is None:
65 dtype = np.dtype("object")
66 if na_value is None:
67 na_value = self._str_na_value
68
69 if not len(self):
70 return np.array([], dtype=dtype)
71
72 arr = np.asarray(self, dtype=object)
73 mask = isna(arr)
74 map_convert = convert and not np.all(mask)
75 try:
76 result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
77 except (TypeError, AttributeError) as err:
78 # Reraise the exception if callable `f` got wrong number of args.
79 # The user may want to be warned by this, instead of getting NaN
80 p_err = (
81 r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
82 r"(?(3)required )positional arguments?"
83 )
84
85 if len(err.args) >= 1 and re.search(p_err, err.args[0]):
86 # FIXME: this should be totally avoidable
87 raise err
88
89 def g(x):
90 # This type of fallback behavior can be removed once
91 # we remove object-dtype .str accessor.
92 try:
93 return f(x)
94 except (TypeError, AttributeError):
95 return na_value
96
97 return self._str_map(g, na_value=na_value, dtype=dtype)
98 if not isinstance(result, np.ndarray):
99 return result
100 if na_value is not np.nan:
101 np.putmask(result, mask, na_value)
102 if convert and result.dtype == object:
103 result = lib.maybe_convert_objects(result)
104 return result
105
106 def _str_count(self, pat, flags: int = 0):
107 regex = re.compile(pat, flags=flags)
108 f = lambda x: len(regex.findall(x))
109 return self._str_map(f, dtype="int64")
110
111 def _str_pad(
112 self,
113 width,
114 side: Literal["left", "right", "both"] = "left",
115 fillchar: str = " ",
116 ):
117 if side == "left":
118 f = lambda x: x.rjust(width, fillchar)
119 elif side == "right":
120 f = lambda x: x.ljust(width, fillchar)
121 elif side == "both":
122 f = lambda x: x.center(width, fillchar)
123 else: # pragma: no cover
124 raise ValueError("Invalid side")
125 return self._str_map(f)
126
127 def _str_contains(
128 self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
129 ):
130 if regex:
131 if not case:
132 flags |= re.IGNORECASE
133
134 pat = re.compile(pat, flags=flags)
135
136 f = lambda x: pat.search(x) is not None
137 else:
138 if case:
139 f = lambda x: pat in x
140 else:
141 upper_pat = pat.upper()
142 f = lambda x: upper_pat in x.upper()
143 return self._str_map(f, na, dtype=np.dtype("bool"))
144
145 def _str_startswith(self, pat, na=None):
146 f = lambda x: x.startswith(pat)
147 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
148
149 def _str_endswith(self, pat, na=None):
150 f = lambda x: x.endswith(pat)
151 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
152
153 def _str_replace(
154 self,
155 pat: str | re.Pattern,
156 repl: str | Callable,
157 n: int = -1,
158 case: bool = True,
159 flags: int = 0,
160 regex: bool = True,
161 ):
162 if case is False:
163 # add case flag, if provided
164 flags |= re.IGNORECASE
165
166 if regex or flags or callable(repl):
167 if not isinstance(pat, re.Pattern):
168 if regex is False:
169 pat = re.escape(pat)
170 pat = re.compile(pat, flags=flags)
171
172 n = n if n >= 0 else 0
173 f = lambda x: pat.sub(repl=repl, string=x, count=n)
174 else:
175 f = lambda x: x.replace(pat, repl, n)
176
177 return self._str_map(f, dtype=str)
178
179 def _str_repeat(self, repeats):
180 if is_scalar(repeats):
181
182 def scalar_rep(x):
183 try:
184 return bytes.__mul__(x, repeats)
185 except TypeError:
186 return str.__mul__(x, repeats)
187
188 return self._str_map(scalar_rep, dtype=str)
189 else:
190 from pandas.core.arrays.string_ import BaseStringArray
191
192 def rep(x, r):
193 if x is libmissing.NA:
194 return x
195 try:
196 return bytes.__mul__(x, r)
197 except TypeError:
198 return str.__mul__(x, r)
199
200 repeats = np.asarray(repeats, dtype=object)
201 result = libops.vec_binop(np.asarray(self), repeats, rep)
202 if isinstance(self, BaseStringArray):
203 # Not going through map, so we have to do this here.
204 result = type(self)._from_sequence(result)
205 return result
206
207 def _str_match(
208 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
209 ):
210 if not case:
211 flags |= re.IGNORECASE
212
213 regex = re.compile(pat, flags=flags)
214
215 f = lambda x: regex.match(x) is not None
216 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
217
218 def _str_fullmatch(
219 self,
220 pat: str | re.Pattern,
221 case: bool = True,
222 flags: int = 0,
223 na: Scalar | None = None,
224 ):
225 if not case:
226 flags |= re.IGNORECASE
227
228 regex = re.compile(pat, flags=flags)
229
230 f = lambda x: regex.fullmatch(x) is not None
231 return self._str_map(f, na_value=na, dtype=np.dtype(bool))
232
233 def _str_encode(self, encoding, errors: str = "strict"):
234 f = lambda x: x.encode(encoding, errors=errors)
235 return self._str_map(f, dtype=object)
236
237 def _str_find(self, sub, start: int = 0, end=None):
238 return self._str_find_(sub, start, end, side="left")
239
240 def _str_rfind(self, sub, start: int = 0, end=None):
241 return self._str_find_(sub, start, end, side="right")
242
243 def _str_find_(self, sub, start, end, side):
244 if side == "left":
245 method = "find"
246 elif side == "right":
247 method = "rfind"
248 else: # pragma: no cover
249 raise ValueError("Invalid side")
250
251 if end is None:
252 f = lambda x: getattr(x, method)(sub, start)
253 else:
254 f = lambda x: getattr(x, method)(sub, start, end)
255 return self._str_map(f, dtype="int64")
256
257 def _str_findall(self, pat, flags: int = 0):
258 regex = re.compile(pat, flags=flags)
259 return self._str_map(regex.findall, dtype="object")
260
261 def _str_get(self, i):
262 def f(x):
263 if isinstance(x, dict):
264 return x.get(i)
265 elif len(x) > i >= -len(x):
266 return x[i]
267 return self._str_na_value
268
269 return self._str_map(f)
270
271 def _str_index(self, sub, start: int = 0, end=None):
272 if end:
273 f = lambda x: x.index(sub, start, end)
274 else:
275 f = lambda x: x.index(sub, start, end)
276 return self._str_map(f, dtype="int64")
277
278 def _str_rindex(self, sub, start: int = 0, end=None):
279 if end:
280 f = lambda x: x.rindex(sub, start, end)
281 else:
282 f = lambda x: x.rindex(sub, start, end)
283 return self._str_map(f, dtype="int64")
284
285 def _str_join(self, sep):
286 return self._str_map(sep.join)
287
288 def _str_partition(self, sep, expand):
289 result = self._str_map(lambda x: x.partition(sep), dtype="object")
290 return result
291
292 def _str_rpartition(self, sep, expand):
293 return self._str_map(lambda x: x.rpartition(sep), dtype="object")
294
295 def _str_len(self):
296 return self._str_map(len, dtype="int64")
297
298 def _str_slice(self, start=None, stop=None, step=None):
299 obj = slice(start, stop, step)
300 return self._str_map(lambda x: x[obj])
301
302 def _str_slice_replace(self, start=None, stop=None, repl=None):
303 if repl is None:
304 repl = ""
305
306 def f(x):
307 if x[start:stop] == "":
308 local_stop = start
309 else:
310 local_stop = stop
311 y = ""
312 if start is not None:
313 y += x[:start]
314 y += repl
315 if stop is not None:
316 y += x[local_stop:]
317 return y
318
319 return self._str_map(f)
320
321 def _str_split(
322 self,
323 pat: str | re.Pattern | None = None,
324 n=-1,
325 expand: bool = False,
326 regex: bool | None = None,
327 ):
328 if pat is None:
329 if n is None or n == 0:
330 n = -1
331 f = lambda x: x.split(pat, n)
332 else:
333 new_pat: str | re.Pattern
334 if regex is True or isinstance(pat, re.Pattern):
335 new_pat = re.compile(pat)
336 elif regex is False:
337 new_pat = pat
338 # regex is None so link to old behavior #43563
339 else:
340 if len(pat) == 1:
341 new_pat = pat
342 else:
343 new_pat = re.compile(pat)
344
345 if isinstance(new_pat, re.Pattern):
346 if n is None or n == -1:
347 n = 0
348 f = lambda x: new_pat.split(x, maxsplit=n)
349 else:
350 if n is None or n == 0:
351 n = -1
352 f = lambda x: x.split(pat, n)
353 return self._str_map(f, dtype=object)
354
355 def _str_rsplit(self, pat=None, n=-1):
356 if n is None or n == 0:
357 n = -1
358 f = lambda x: x.rsplit(pat, n)
359 return self._str_map(f, dtype="object")
360
361 def _str_translate(self, table):
362 return self._str_map(lambda x: x.translate(table))
363
364 def _str_wrap(self, width, **kwargs):
365 kwargs["width"] = width
366 tw = textwrap.TextWrapper(**kwargs)
367 return self._str_map(lambda s: "\n".join(tw.wrap(s)))
368
369 def _str_get_dummies(self, sep: str = "|"):
370 from pandas import Series
371
372 arr = Series(self).fillna("")
373 try:
374 arr = sep + arr + sep
375 except (TypeError, NotImplementedError):
376 arr = sep + arr.astype(str) + sep
377
378 tags: set[str] = set()
379 for ts in Series(arr, copy=False).str.split(sep):
380 tags.update(ts)
381 tags2 = sorted(tags - {""})
382
383 dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
384
385 def _isin(test_elements: str, element: str) -> bool:
386 return element in test_elements
387
388 for i, t in enumerate(tags2):
389 pat = sep + t + sep
390 dummies[:, i] = lib.map_infer(
391 arr.to_numpy(), functools.partial(_isin, element=pat)
392 )
393 return dummies, tags2
394
395 def _str_upper(self):
396 return self._str_map(lambda x: x.upper())
397
398 def _str_isalnum(self):
399 return self._str_map(str.isalnum, dtype="bool")
400
401 def _str_isalpha(self):
402 return self._str_map(str.isalpha, dtype="bool")
403
404 def _str_isdecimal(self):
405 return self._str_map(str.isdecimal, dtype="bool")
406
407 def _str_isdigit(self):
408 return self._str_map(str.isdigit, dtype="bool")
409
410 def _str_islower(self):
411 return self._str_map(str.islower, dtype="bool")
412
413 def _str_isnumeric(self):
414 return self._str_map(str.isnumeric, dtype="bool")
415
416 def _str_isspace(self):
417 return self._str_map(str.isspace, dtype="bool")
418
419 def _str_istitle(self):
420 return self._str_map(str.istitle, dtype="bool")
421
422 def _str_isupper(self):
423 return self._str_map(str.isupper, dtype="bool")
424
425 def _str_capitalize(self):
426 return self._str_map(str.capitalize)
427
428 def _str_casefold(self):
429 return self._str_map(str.casefold)
430
431 def _str_title(self):
432 return self._str_map(str.title)
433
434 def _str_swapcase(self):
435 return self._str_map(str.swapcase)
436
437 def _str_lower(self):
438 return self._str_map(str.lower)
439
440 def _str_normalize(self, form):
441 f = lambda x: unicodedata.normalize(form, x)
442 return self._str_map(f)
443
444 def _str_strip(self, to_strip=None):
445 return self._str_map(lambda x: x.strip(to_strip))
446
447 def _str_lstrip(self, to_strip=None):
448 return self._str_map(lambda x: x.lstrip(to_strip))
449
450 def _str_rstrip(self, to_strip=None):
451 return self._str_map(lambda x: x.rstrip(to_strip))
452
453 def _str_removeprefix(self, prefix: str) -> Series:
454 # outstanding question on whether to use native methods for users on Python 3.9+
455 # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
456 # in which case we could do return self._str_map(str.removeprefix)
457
458 def removeprefix(text: str) -> str:
459 if text.startswith(prefix):
460 return text[len(prefix) :]
461 return text
462
463 return self._str_map(removeprefix)
464
465 def _str_removesuffix(self, suffix: str) -> Series:
466 if sys.version_info < (3, 9):
467 # NOTE pyupgrade will remove this when we run it with --py39-plus
468 # so don't remove the unnecessary `else` statement below
469 from pandas.util._str_methods import removesuffix
470
471 return self._str_map(functools.partial(removesuffix, suffix=suffix))
472 else:
473 return self._str_map(lambda x: x.removesuffix(suffix))
474
475 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
476 regex = re.compile(pat, flags=flags)
477 na_value = self._str_na_value
478
479 if not expand:
480
481 def g(x):
482 m = regex.search(x)
483 return m.groups()[0] if m else na_value
484
485 return self._str_map(g, convert=False)
486
487 empty_row = [na_value] * regex.groups
488
489 def f(x):
490 if not isinstance(x, str):
491 return empty_row
492 m = regex.search(x)
493 if m:
494 return [na_value if item is None else item for item in m.groups()]
495 else:
496 return empty_row
497
498 return [f(val) for val in np.asarray(self)]