1from __future__ import annotations
2
3import codecs
4from functools import wraps
5import re
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Literal,
10 cast,
11)
12import warnings
13
14import numpy as np
15
16from pandas._libs import lib
17from pandas._typing import (
18 AlignJoin,
19 DtypeObj,
20 F,
21 Scalar,
22 npt,
23)
24from pandas.util._decorators import Appender
25from pandas.util._exceptions import find_stack_level
26
27from pandas.core.dtypes.common import (
28 ensure_object,
29 is_bool_dtype,
30 is_integer,
31 is_list_like,
32 is_object_dtype,
33 is_re,
34)
35from pandas.core.dtypes.dtypes import (
36 ArrowDtype,
37 CategoricalDtype,
38)
39from pandas.core.dtypes.generic import (
40 ABCDataFrame,
41 ABCIndex,
42 ABCMultiIndex,
43 ABCSeries,
44)
45from pandas.core.dtypes.missing import isna
46
47from pandas.core.arrays import ExtensionArray
48from pandas.core.base import NoNewAttributesMixin
49from pandas.core.construction import extract_array
50
51if TYPE_CHECKING:
52 from collections.abc import (
53 Hashable,
54 Iterator,
55 )
56
57 from pandas import (
58 DataFrame,
59 Index,
60 Series,
61 )
62
# Registry of docstring templates that are shared (via @Appender with
# %-formatting) by several related methods, e.g. split/rsplit.
_shared_docs: dict[str, str] = {}

# Encodings for which CPython is known to have optimized codec paths;
# presumably used elsewhere in this module to call str.encode/bytes.decode
# directly instead of going through the codecs module — usage not in this
# chunk, confirm against the encode/decode methods.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally has fast paths for the UTF-16/32 codecs.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
74
75
def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Decorator to forbid specific types for a method of StringMethods.

    For calling `.str.{method}` on a Series or Index, it is necessary to first
    initialize the :class:`StringMethods` object, and then call the method.
    However, different methods allow different input types, and so this can not
    be checked during :meth:`StringMethods.__init__`, but must be done on a
    per-method basis. This decorator exists to facilitate this process, and
    make it explicit which (inferred) types are disallowed by the method.

    :meth:`StringMethods.__init__` allows the *union* of types its different
    methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
    namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].

    The default string types ['string', 'empty'] are allowed for all methods.
    For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
    then needs to forbid the types it is not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. By default, this is
        None, in which case the name from the method being wrapped will be
        copied. However, for working with further wrappers (like _pat_wrapper
        and _noarg_wrapper), it is necessary to specify the name.

    Returns
    -------
    func : wrapper
        The method to which the decorator is applied, with an added check that
        enforces the inferred type to not be in the list of forbidden types.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    disallowed = set(forbidden) if forbidden is not None else set()
    permitted = {"string", "empty", "bytes", "mixed", "mixed-integer"} - disallowed

    def _forbid_nonstring_types(func: F) -> F:
        # prefer the explicitly supplied name (needed when `func` is itself
        # a generic wrapper whose __name__ is not the public method name)
        method_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            inferred = self._inferred_dtype
            if inferred in permitted:
                return func(self, *args, **kwargs)
            raise TypeError(
                f"Cannot use .str.{method_name} with values of "
                f"inferred dtype '{inferred}'."
            )

        wrapper.__name__ = method_name
        # the cast-to-F of the original is a runtime no-op; a type-ignore
        # expresses the same intent without evaluating the TypeVar
        return wrapper  # type: ignore[return-value]

    return _forbid_nonstring_types
143
144
def _map_and_wrap(name: str | None, docstring: str | None):
    """
    Build a no-argument StringMethods method that dispatches to the
    underlying array's ``_str_{name}`` implementation and wraps the result.

    ``isnumeric``/``isdecimal`` produce boolean results, so those are
    flagged as non-string when wrapping.
    """

    @forbid_nonstring_types(["bytes"], name=name)
    def wrapper(self):
        array_method = getattr(self._data.array, f"_str_{name}")
        is_string_result = name not in ("isnumeric", "isdecimal")
        return self._wrap_result(array_method(), returns_string=is_string_result)

    wrapper.__doc__ = docstring
    return wrapper
155
156
class StringMethods(NoNewAttributesMixin):
    """
    Vectorized string functions for Series and Index.

    NAs stay NA unless handled otherwise by a particular method.
    Patterned after Python's string methods, with some inspiration from
    R's stringr package.

    Parameters
    ----------
    data : Series or Index
        The Series or Index whose values the string methods operate on.

    Examples
    --------
    >>> s = pd.Series(["A_Str_Series"])
    >>> s
    0    A_Str_Series
    dtype: object

    >>> s.str.split("_")
    0    [A, Str, Series]
    dtype: object

    >>> s.str.replace("_", "")
    0    AStrSeries
    dtype: object
    """

    # Note: see the docstring in pandas.core.strings.__init__
    # for an explanation of the implementation.
    # TODO: Dispatch all the methods
    # Currently the following are not dispatched to the array
    # * cat
    # * extractall
187
    def __init__(self, data) -> None:
        """
        Validate ``data`` as string-like and cache dtype/metadata flags
        used by the individual string methods.
        """
        from pandas.core.arrays.string_ import StringDtype

        # raises AttributeError if the inferred dtype is not string-like
        self._inferred_dtype = self._validate(data)
        self._is_categorical = isinstance(data.dtype, CategoricalDtype)
        self._is_string = isinstance(data.dtype, StringDtype)
        self._data = data

        # index/name only exist for Series callers; Index callers keep None
        self._index = self._name = None
        if isinstance(data, ABCSeries):
            self._index = data.index
            self._name = data.name

        # ._values.categories works for both Series/Index
        self._parent = data._values.categories if self._is_categorical else data
        # save orig to blow up categoricals to the right type
        self._orig = data
        self._freeze()
206
207 @staticmethod
208 def _validate(data):
209 """
210 Auxiliary function for StringMethods, infers and checks dtype of data.
211
212 This is a "first line of defence" at the creation of the StringMethods-
213 object, and just checks that the dtype is in the
214 *union* of the allowed types over all string methods below; this
215 restriction is then refined on a per-method basis using the decorator
216 @forbid_nonstring_types (more info in the corresponding docstring).
217
218 This really should exclude all series/index with any non-string values,
219 but that isn't practical for performance reasons until we have a str
220 dtype (GH 9343 / 13877)
221
222 Parameters
223 ----------
224 data : The content of the Series
225
226 Returns
227 -------
228 dtype : inferred dtype of data
229 """
230 if isinstance(data, ABCMultiIndex):
231 raise AttributeError(
232 "Can only use .str accessor with Index, not MultiIndex"
233 )
234
235 # see _libs/lib.pyx for list of inferred types
236 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
237
238 data = extract_array(data)
239
240 values = getattr(data, "categories", data) # categorical / normal
241
242 inferred_dtype = lib.infer_dtype(values, skipna=True)
243
244 if inferred_dtype not in allowed_types:
245 raise AttributeError("Can only use .str accessor with string values!")
246 return inferred_dtype
247
248 def __getitem__(self, key):
249 result = self._data.array._str_getitem(key)
250 return self._wrap_result(result)
251
252 def __iter__(self) -> Iterator:
253 raise TypeError(f"'{type(self).__name__}' object is not iterable")
254
    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string: bool = True,
        returns_bool: bool = False,
        dtype=None,
    ):
        """
        Wrap the raw output of an array ``_str_*`` method into a
        Series/Index/DataFrame matching the calling object.

        Parameters
        ----------
        result : array-like, DataFrame, or scalar
            Raw output of the dispatched string method.
        name : Hashable or list-like, optional
            Name for the result, or column labels when expanding.
        expand : bool or None, default None
            Whether to expand list-like elements into separate columns;
            if None, inferred from ``result.ndim``.
        fill_value : scalar, default np.nan
            NOTE(review): not referenced anywhere in this body.
        returns_string : bool, default True
            Whether the wrapped values are strings; controls the dtype
            chosen for string-backed inputs.
        returns_bool : bool, default False
            NOTE(review): not referenced anywhere in this body.
        dtype : dtype, optional
            Explicit dtype for the wrapped result.
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        # scalars and other non-array results pass through unchanged
        # (DataFrames still get __finalize__'d to propagate metadata)
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.
        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1
        elif expand is True and not isinstance(self._orig, ABCIndex):
            # required when expand=True is explicitly specified
            # not needed when inferred
            if isinstance(result.dtype, ArrowDtype):
                import pyarrow as pa

                from pandas.compat import pa_version_under11p0

                from pandas.core.arrays.arrow.array import ArrowExtensionArray

                # pad every list element to the same length so the flattened
                # values can be reshaped into a rectangular (rows, max_len) grid
                value_lengths = pa.compute.list_value_length(result._pa_array)
                max_len = pa.compute.max(value_lengths).as_py()
                min_len = pa.compute.min(value_lengths).as_py()
                if result._hasna:
                    # ArrowExtensionArray.fillna doesn't work for list scalars
                    result = ArrowExtensionArray(
                        result._pa_array.fill_null([None] * max_len)
                    )
                if min_len < max_len:
                    # append nulls to each scalar list element up to max_len
                    if not pa_version_under11p0:
                        result = ArrowExtensionArray(
                            pa.compute.list_slice(
                                result._pa_array,
                                start=0,
                                stop=max_len,
                                return_fixed_size_list=True,
                            )
                        )
                    else:
                        # manual padding fallback for older pyarrow
                        all_null = np.full(max_len, fill_value=None, dtype=object)
                        values = result.to_numpy()
                        new_values = []
                        for row in values:
                            if len(row) < max_len:
                                nulls = all_null[: max_len - len(row)]
                                row = np.append(row, nulls)
                            new_values.append(row)
                        pa_type = result._pa_array.type
                        result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
                if name is not None:
                    labels = name
                else:
                    labels = range(max_len)
                result = (
                    pa.compute.list_flatten(result._pa_array)
                    .to_numpy()
                    .reshape(len(result), max_len)
                )
                # one ArrowExtensionArray column per label
                result = {
                    label: ArrowExtensionArray(pa.array(res))
                    for label, res in zip(labels, result.T)
                }
            elif is_object_dtype(result):

                def cons_row(x):
                    # wrap scalars so every row is list-like before expanding
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                result = [cons_row(x) for x in result]
                if result and not self._is_string:
                    # propagate nan values to match longest sequence (GH 18450)
                    max_len = max(len(x) for x in result)
                    result = [
                        x * max_len if len(x) == 0 or x[0] is np.nan else x
                        for x in result
                    ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out: Index = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name, dtype=dtype)
        else:
            index = self._orig.index
            # This is a mess.
            _dtype: DtypeObj | str | None = dtype
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    _dtype = result.dtype
                elif returns_string:
                    _dtype = self._orig.dtype
                else:
                    _dtype = vdtype
            elif vdtype is not None:
                _dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=_dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=_dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result
413
414 def _get_series_list(self, others):
415 """
416 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
417 into a list of Series (elements without an index must match the length
418 of the calling Series/Index).
419
420 Parameters
421 ----------
422 others : Series, DataFrame, np.ndarray, list-like or list-like of
423 Objects that are either Series, Index or np.ndarray (1-dim).
424
425 Returns
426 -------
427 list of Series
428 Others transformed into list of Series.
429 """
430 from pandas import (
431 DataFrame,
432 Series,
433 )
434
435 # self._orig is either Series or Index
436 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
437
438 # Generally speaking, all objects without an index inherit the index
439 # `idx` of the calling Series/Index - i.e. must have matching length.
440 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
441 if isinstance(others, ABCSeries):
442 return [others]
443 elif isinstance(others, ABCIndex):
444 return [Series(others, index=idx, dtype=others.dtype)]
445 elif isinstance(others, ABCDataFrame):
446 return [others[x] for x in others]
447 elif isinstance(others, np.ndarray) and others.ndim == 2:
448 others = DataFrame(others, index=idx)
449 return [others[x] for x in others]
450 elif is_list_like(others, allow_sets=False):
451 try:
452 others = list(others) # ensure iterators do not get read twice etc
453 except TypeError:
454 # e.g. ser.str, raise below
455 pass
456 else:
457 # in case of list-like `others`, all elements must be
458 # either Series/Index/np.ndarray (1-dim)...
459 if all(
460 isinstance(x, (ABCSeries, ABCIndex, ExtensionArray))
461 or (isinstance(x, np.ndarray) and x.ndim == 1)
462 for x in others
463 ):
464 los: list[Series] = []
465 while others: # iterate through list and append each element
466 los = los + self._get_series_list(others.pop(0))
467 return los
468 # ... or just strings
469 elif all(not is_list_like(x) for x in others):
470 return [Series(others, index=idx)]
471 raise TypeError(
472 "others must be Series, Index, DataFrame, np.ndarray "
473 "or list-like (either containing only strings or "
474 "containing only objects of type Series/Index/"
475 "np.ndarray[1-dim])"
476 )
477
    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(
        self,
        others=None,
        sep: str | None = None,
        na_rep=None,
        join: AlignJoin = "left",
    ) -> str | Series | Index:
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        # TODO: dispatch
        from pandas import (
            Index,
            Series,
            concat,
        )

        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndex):
            # work on a Series so alignment/indexing below behaves uniformly
            data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Series")
            data = ensure_object(data)  # type: ignore[assignment]
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # drop NAs entirely before joining
                return sep.join(data[~na_mask])
            elif na_rep is not None and na_mask.any():
                # substitute na_rep for NAs before joining
                return sep.join(np.where(na_mask, na_rep, data))
            else:
                return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError as err:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            ) from err

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        # object-dtype columns plus a row-wise union of their NA masks
        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            # cat_safe is a module-level helper (not in this view)
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        out: Index | Series
        if isinstance(self._orig.dtype, CategoricalDtype):
            # We need to infer the new categories.
            dtype = self._orig.dtype.categories.dtype
        else:
            dtype = self._orig.dtype
        if isinstance(self._orig, ABCIndex):
            # add dtype for case that result is all-NA
            if isna(result).all():
                dtype = object  # type: ignore[assignment]

            out = Index(result, dtype=dtype, name=self._orig.name)
        else:  # Series
            res_ser = Series(
                result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
            )
            out = res_ser.__finalize__(self._orig, method="str_cat")
        return out
709
    # Docstring template shared by split/rsplit; the %(...)s placeholders are
    # filled per-method below and the result attached via @Appender.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string.

    Parameters
    ----------
    pat : str%(pat_regex)s, optional
        %(pat_description)s.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        - If ``False``, return Series/Index, containing lists of strings.
    %(regex_argument)s
    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).
    %(raises_split)s
    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`,  make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.
    %(regex_pat_note)s
    Examples
    --------
    >>> s = pd.Series(
    ...     [
    ...         "this is a regular sentence",
    ...         "https://docs.python.org/3/tutorial/index.html",
    ...         np.nan
    ...     ]
    ... )
    >>> s
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat="/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                            NaN   NaN   NaN      NaN       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN
    %(regex_examples)s"""
833
834 @Appender(
835 _shared_docs["str_split"]
836 % {
837 "side": "beginning",
838 "pat_regex": " or compiled regex",
839 "pat_description": "String or regular expression to split on",
840 "regex_argument": """
841 regex : bool, default None
842 Determines if the passed-in pattern is a regular expression:
843
844 - If ``True``, assumes the passed-in pattern is a regular expression
845 - If ``False``, treats the pattern as a literal string.
846 - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
847 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
848 - Cannot be set to False if `pat` is a compiled regex
849
850 .. versionadded:: 1.4.0
851 """,
852 "raises_split": """
853 Raises
854 ------
855 ValueError
856 * if `regex` is False and `pat` is a compiled regex
857 """,
858 "regex_pat_note": """
859 Use of `regex =False` with a `pat` as a compiled regex will raise an error.
860 """,
861 "method": "split",
862 "regex_examples": r"""
863 Remember to escape special characters when explicitly using regular expressions.
864
865 >>> s = pd.Series(["foo and bar plus baz"])
866 >>> s.str.split(r"and|plus", expand=True)
867 0 1 2
868 0 foo bar baz
869
870 Regular expressions can be used to handle urls or file names.
871 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
872 as a regex only if ``len(pat) != 1``.
873
874 >>> s = pd.Series(['foojpgbar.jpg'])
875 >>> s.str.split(r".", expand=True)
876 0 1
877 0 foojpgbar jpg
878
879 >>> s.str.split(r"\.jpg", expand=True)
880 0 1
881 0 foojpgbar
882
883 When ``regex=True``, `pat` is interpreted as a regex
884
885 >>> s.str.split(r"\.jpg", regex=True, expand=True)
886 0 1
887 0 foojpgbar
888
889 A compiled regex can be passed as `pat`
890
891 >>> import re
892 >>> s.str.split(re.compile(r"\.jpg"), expand=True)
893 0 1
894 0 foojpgbar
895
896 When ``regex=False``, `pat` is interpreted as the string itself
897
898 >>> s.str.split(r"\.jpg", regex=False, expand=True)
899 0
900 0 foojpgbar.jpg
901 """,
902 }
903 )
904 @forbid_nonstring_types(["bytes"])
905 def split(
906 self,
907 pat: str | re.Pattern | None = None,
908 *,
909 n=-1,
910 expand: bool = False,
911 regex: bool | None = None,
912 ):
913 if regex is False and is_re(pat):
914 raise ValueError(
915 "Cannot use a compiled regex as replacement pattern with regex=False"
916 )
917 if is_re(pat):
918 regex = True
919 result = self._data.array._str_split(pat, n, expand, regex)
920 if self._data.dtype == "category":
921 dtype = self._data.dtype.categories.dtype
922 else:
923 dtype = object if self._data.dtype == object else None
924 return self._wrap_result(
925 result, expand=expand, returns_string=expand, dtype=dtype
926 )
927
928 @Appender(
929 _shared_docs["str_split"]
930 % {
931 "side": "end",
932 "pat_regex": "",
933 "pat_description": "String to split on",
934 "regex_argument": "",
935 "raises_split": "",
936 "regex_pat_note": "",
937 "method": "rsplit",
938 "regex_examples": "",
939 }
940 )
941 @forbid_nonstring_types(["bytes"])
942 def rsplit(self, pat=None, *, n=-1, expand: bool = False):
943 result = self._data.array._str_rsplit(pat, n=n)
944 dtype = object if self._data.dtype == object else None
945 return self._wrap_result(
946 result, expand=expand, returns_string=expand, dtype=dtype
947 )
948
    # Docstring template shared by partition/rpartition; filled via
    # %-formatting and attached with @Appender below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                   0  1            2
    0  Linda van der            Berg
    1    George Pitt     -    Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """
1030
1031 @Appender(
1032 _shared_docs["str_partition"]
1033 % {
1034 "side": "first",
1035 "return": "3 elements containing the string itself, followed by two "
1036 "empty strings",
1037 "also": "rpartition : Split the string at the last occurrence of `sep`.",
1038 }
1039 )
1040 @forbid_nonstring_types(["bytes"])
1041 def partition(self, sep: str = " ", expand: bool = True):
1042 result = self._data.array._str_partition(sep, expand)
1043 if self._data.dtype == "category":
1044 dtype = self._data.dtype.categories.dtype
1045 else:
1046 dtype = object if self._data.dtype == object else None
1047 return self._wrap_result(
1048 result, expand=expand, returns_string=expand, dtype=dtype
1049 )
1050
1051 @Appender(
1052 _shared_docs["str_partition"]
1053 % {
1054 "side": "last",
1055 "return": "3 elements containing two empty strings, followed by the "
1056 "string itself",
1057 "also": "partition : Split the string at the first occurrence of `sep`.",
1058 }
1059 )
1060 @forbid_nonstring_types(["bytes"])
1061 def rpartition(self, sep: str = " ", expand: bool = True):
1062 result = self._data.array._str_rpartition(sep, expand)
1063 if self._data.dtype == "category":
1064 dtype = self._data.dtype.categories.dtype
1065 else:
1066 dtype = object if self._data.dtype == object else None
1067 return self._wrap_result(
1068 result, expand=expand, returns_string=expand, dtype=dtype
1069 )
1070
1071 def get(self, i):
1072 """
1073 Extract element from each component at specified position or with specified key.
1074
1075 Extract element from lists, tuples, dict, or strings in each element in the
1076 Series/Index.
1077
1078 Parameters
1079 ----------
1080 i : int or hashable dict label
1081 Position or key of element to extract.
1082
1083 Returns
1084 -------
1085 Series or Index
1086
1087 Examples
1088 --------
1089 >>> s = pd.Series(["String",
1090 ... (1, 2, 3),
1091 ... ["a", "b", "c"],
1092 ... 123,
1093 ... -456,
1094 ... {1: "Hello", "2": "World"}])
1095 >>> s
1096 0 String
1097 1 (1, 2, 3)
1098 2 [a, b, c]
1099 3 123
1100 4 -456
1101 5 {1: 'Hello', '2': 'World'}
1102 dtype: object
1103
1104 >>> s.str.get(1)
1105 0 t
1106 1 2
1107 2 b
1108 3 NaN
1109 4 NaN
1110 5 Hello
1111 dtype: object
1112
1113 >>> s.str.get(-1)
1114 0 g
1115 1 3
1116 2 c
1117 3 NaN
1118 4 NaN
1119 5 None
1120 dtype: object
1121
1122 Return element with given key
1123
1124 >>> s = pd.Series([{"name": "Hello", "value": "World"},
1125 ... {"name": "Goodbye", "value": "Planet"}])
1126 >>> s.str.get('name')
1127 0 Hello
1128 1 Goodbye
1129 dtype: object
1130 """
1131 result = self._data.array._str_get(i)
1132 return self._wrap_result(result)
1133
1134 @forbid_nonstring_types(["bytes"])
1135 def join(self, sep: str):
1136 """
1137 Join lists contained as elements in the Series/Index with passed delimiter.
1138
1139 If the elements of a Series are lists themselves, join the content of these
1140 lists using the delimiter passed to the function.
1141 This function is an equivalent to :meth:`str.join`.
1142
1143 Parameters
1144 ----------
1145 sep : str
1146 Delimiter to use between list entries.
1147
1148 Returns
1149 -------
1150 Series/Index: object
1151 The list entries concatenated by intervening occurrences of the
1152 delimiter.
1153
1154 Raises
1155 ------
1156 AttributeError
1157 If the supplied Series contains neither strings nor lists.
1158
1159 See Also
1160 --------
1161 str.join : Standard library version of this method.
1162 Series.str.split : Split strings around given separator/delimiter.
1163
1164 Notes
1165 -----
1166 If any of the list items is not a string object, the result of the join
1167 will be `NaN`.
1168
1169 Examples
1170 --------
1171 Example with a list that contains non-string elements.
1172
1173 >>> s = pd.Series([['lion', 'elephant', 'zebra'],
1174 ... [1.1, 2.2, 3.3],
1175 ... ['cat', np.nan, 'dog'],
1176 ... ['cow', 4.5, 'goat'],
1177 ... ['duck', ['swan', 'fish'], 'guppy']])
1178 >>> s
1179 0 [lion, elephant, zebra]
1180 1 [1.1, 2.2, 3.3]
1181 2 [cat, nan, dog]
1182 3 [cow, 4.5, goat]
1183 4 [duck, [swan, fish], guppy]
1184 dtype: object
1185
1186 Join all lists using a '-'. The lists containing object(s) of types other
1187 than str will produce a NaN.
1188
1189 >>> s.str.join('-')
1190 0 lion-elephant-zebra
1191 1 NaN
1192 2 NaN
1193 3 NaN
1194 4 NaN
1195 dtype: object
1196 """
1197 result = self._data.array._str_join(sep)
1198 return self._wrap_result(result)
1199
1200 @forbid_nonstring_types(["bytes"])
1201 def contains(
1202 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
1203 ):
1204 r"""
1205 Test if pattern or regex is contained within a string of a Series or Index.
1206
1207 Return boolean Series or Index based on whether a given pattern or regex is
1208 contained within a string of a Series or Index.
1209
1210 Parameters
1211 ----------
1212 pat : str
1213 Character sequence or regular expression.
1214 case : bool, default True
1215 If True, case sensitive.
1216 flags : int, default 0 (no flags)
1217 Flags to pass through to the re module, e.g. re.IGNORECASE.
1218 na : scalar, optional
1219 Fill value for missing values. The default depends on dtype of the
1220 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1221 ``pandas.NA`` is used.
1222 regex : bool, default True
1223 If True, assumes the pat is a regular expression.
1224
1225 If False, treats the pat as a literal string.
1226
1227 Returns
1228 -------
1229 Series or Index of boolean values
1230 A Series or Index of boolean values indicating whether the
1231 given pattern is contained within the string of each element
1232 of the Series or Index.
1233
1234 See Also
1235 --------
1236 match : Analogous, but stricter, relying on re.match instead of re.search.
1237 Series.str.startswith : Test if the start of each string element matches a
1238 pattern.
1239 Series.str.endswith : Same as startswith, but tests the end of string.
1240
1241 Examples
1242 --------
1243 Returning a Series of booleans using only a literal pattern.
1244
1245 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
1246 >>> s1.str.contains('og', regex=False)
1247 0 False
1248 1 True
1249 2 False
1250 3 False
1251 4 NaN
1252 dtype: object
1253
1254 Returning an Index of booleans using only a literal pattern.
1255
1256 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan])
1257 >>> ind.str.contains('23', regex=False)
1258 Index([False, False, False, True, nan], dtype='object')
1259
1260 Specifying case sensitivity using `case`.
1261
1262 >>> s1.str.contains('oG', case=True, regex=True)
1263 0 False
1264 1 False
1265 2 False
1266 3 False
1267 4 NaN
1268 dtype: object
1269
1270 Specifying `na` to be `False` instead of `NaN` replaces NaN values
1271 with `False`. If Series or Index does not contain NaN values
1272 the resultant dtype will be `bool`, otherwise, an `object` dtype.
1273
1274 >>> s1.str.contains('og', na=False, regex=True)
1275 0 False
1276 1 True
1277 2 False
1278 3 False
1279 4 False
1280 dtype: bool
1281
1282 Returning 'house' or 'dog' when either expression occurs in a string.
1283
1284 >>> s1.str.contains('house|dog', regex=True)
1285 0 False
1286 1 True
1287 2 True
1288 3 False
1289 4 NaN
1290 dtype: object
1291
1292 Ignoring case sensitivity using `flags` with regex.
1293
1294 >>> import re
1295 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
1296 0 False
1297 1 False
1298 2 True
1299 3 False
1300 4 NaN
1301 dtype: object
1302
1303 Returning any digit using regular expression.
1304
1305 >>> s1.str.contains('\\d', regex=True)
1306 0 False
1307 1 False
1308 2 False
1309 3 True
1310 4 NaN
1311 dtype: object
1312
1313 Ensure `pat` is a not a literal pattern when `regex` is set to True.
1314 Note in the following example one might expect only `s2[1]` and `s2[3]` to
1315 return `True`. However, '.0' as a regex matches any character
1316 followed by a 0.
1317
1318 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
1319 >>> s2.str.contains('.0', regex=True)
1320 0 True
1321 1 True
1322 2 False
1323 3 True
1324 4 False
1325 dtype: bool
1326 """
1327 if regex and re.compile(pat).groups:
1328 warnings.warn(
1329 "This pattern is interpreted as a regular expression, and has "
1330 "match groups. To actually get the groups, use str.extract.",
1331 UserWarning,
1332 stacklevel=find_stack_level(),
1333 )
1334
1335 result = self._data.array._str_contains(pat, case, flags, na, regex)
1336 return self._wrap_result(result, fill_value=na, returns_string=False)
1337
1338 @forbid_nonstring_types(["bytes"])
1339 def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
1340 """
1341 Determine if each string starts with a match of a regular expression.
1342
1343 Parameters
1344 ----------
1345 pat : str
1346 Character sequence.
1347 case : bool, default True
1348 If True, case sensitive.
1349 flags : int, default 0 (no flags)
1350 Regex module flags, e.g. re.IGNORECASE.
1351 na : scalar, optional
1352 Fill value for missing values. The default depends on dtype of the
1353 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1354 ``pandas.NA`` is used.
1355
1356 Returns
1357 -------
1358 Series/Index/array of boolean values
1359
1360 See Also
1361 --------
1362 fullmatch : Stricter matching that requires the entire string to match.
1363 contains : Analogous, but less strict, relying on re.search instead of
1364 re.match.
1365 extract : Extract matched groups.
1366
1367 Examples
1368 --------
1369 >>> ser = pd.Series(["horse", "eagle", "donkey"])
1370 >>> ser.str.match("e")
1371 0 False
1372 1 True
1373 2 False
1374 dtype: bool
1375 """
1376 result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
1377 return self._wrap_result(result, fill_value=na, returns_string=False)
1378
1379 @forbid_nonstring_types(["bytes"])
1380 def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
1381 """
1382 Determine if each string entirely matches a regular expression.
1383
1384 Parameters
1385 ----------
1386 pat : str
1387 Character sequence or regular expression.
1388 case : bool, default True
1389 If True, case sensitive.
1390 flags : int, default 0 (no flags)
1391 Regex module flags, e.g. re.IGNORECASE.
1392 na : scalar, optional
1393 Fill value for missing values. The default depends on dtype of the
1394 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1395 ``pandas.NA`` is used.
1396
1397 Returns
1398 -------
1399 Series/Index/array of boolean values
1400
1401 See Also
1402 --------
1403 match : Similar, but also returns `True` when only a *prefix* of the string
1404 matches the regular expression.
1405 extract : Extract matched groups.
1406
1407 Examples
1408 --------
1409 >>> ser = pd.Series(["cat", "duck", "dove"])
1410 >>> ser.str.fullmatch(r'd.+')
1411 0 False
1412 1 True
1413 2 True
1414 dtype: bool
1415 """
1416 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
1417 return self._wrap_result(result, fill_value=na, returns_string=False)
1418
1419 @forbid_nonstring_types(["bytes"])
1420 def replace(
1421 self,
1422 pat: str | re.Pattern,
1423 repl: str | Callable,
1424 n: int = -1,
1425 case: bool | None = None,
1426 flags: int = 0,
1427 regex: bool = False,
1428 ):
1429 r"""
1430 Replace each occurrence of pattern/regex in the Series/Index.
1431
1432 Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
1433 the regex value.
1434
1435 Parameters
1436 ----------
1437 pat : str or compiled regex
1438 String can be a character sequence or regular expression.
1439 repl : str or callable
1440 Replacement string or a callable. The callable is passed the regex
1441 match object and must return a replacement string to be used.
1442 See :func:`re.sub`.
1443 n : int, default -1 (all)
1444 Number of replacements to make from start.
1445 case : bool, default None
1446 Determines if replace is case sensitive:
1447
1448 - If True, case sensitive (the default if `pat` is a string)
1449 - Set to False for case insensitive
1450 - Cannot be set if `pat` is a compiled regex.
1451
1452 flags : int, default 0 (no flags)
1453 Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
1454 regex.
1455 regex : bool, default False
1456 Determines if the passed-in pattern is a regular expression:
1457
1458 - If True, assumes the passed-in pattern is a regular expression.
1459 - If False, treats the pattern as a literal string
1460 - Cannot be set to False if `pat` is a compiled regex or `repl` is
1461 a callable.
1462
1463 Returns
1464 -------
1465 Series or Index of object
1466 A copy of the object with all matching occurrences of `pat` replaced by
1467 `repl`.
1468
1469 Raises
1470 ------
1471 ValueError
1472 * if `regex` is False and `repl` is a callable or `pat` is a compiled
1473 regex
1474 * if `pat` is a compiled regex and `case` or `flags` is set
1475
1476 Notes
1477 -----
1478 When `pat` is a compiled regex, all flags should be included in the
1479 compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
1480 regex will raise an error.
1481
1482 Examples
1483 --------
1484 When `pat` is a string and `regex` is True, the given `pat`
1485 is compiled as a regex. When `repl` is a string, it replaces matching
1486 regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
1487 left as is:
1488
1489 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
1490 0 bao
1491 1 baz
1492 2 NaN
1493 dtype: object
1494
1495 When `pat` is a string and `regex` is False, every `pat` is replaced with
1496 `repl` as with :meth:`str.replace`:
1497
1498 >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
1499 0 bao
1500 1 fuz
1501 2 NaN
1502 dtype: object
1503
1504 When `repl` is a callable, it is called on every `pat` using
1505 :func:`re.sub`. The callable should expect one positional argument
1506 (a regex object) and return a string.
1507
1508 To get the idea:
1509
1510 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
1511 0 <re.Match object; span=(0, 1), match='f'>oo
1512 1 <re.Match object; span=(0, 1), match='f'>uz
1513 2 NaN
1514 dtype: object
1515
1516 Reverse every lowercase alphabetic word:
1517
1518 >>> repl = lambda m: m.group(0)[::-1]
1519 >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
1520 >>> ser.str.replace(r'[a-z]+', repl, regex=True)
1521 0 oof 123
1522 1 rab zab
1523 2 NaN
1524 dtype: object
1525
1526 Using regex groups (extract second group and swap case):
1527
1528 >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
1529 >>> repl = lambda m: m.group('two').swapcase()
1530 >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
1531 >>> ser.str.replace(pat, repl, regex=True)
1532 0 tWO
1533 1 bAR
1534 dtype: object
1535
1536 Using a compiled regex with flags
1537
1538 >>> import re
1539 >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
1540 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
1541 0 foo
1542 1 bar
1543 2 NaN
1544 dtype: object
1545 """
1546 # Check whether repl is valid (GH 13438, GH 15055)
1547 if not (isinstance(repl, str) or callable(repl)):
1548 raise TypeError("repl must be a string or callable")
1549
1550 is_compiled_re = is_re(pat)
1551 if regex or regex is None:
1552 if is_compiled_re and (case is not None or flags != 0):
1553 raise ValueError(
1554 "case and flags cannot be set when pat is a compiled regex"
1555 )
1556
1557 elif is_compiled_re:
1558 raise ValueError(
1559 "Cannot use a compiled regex as replacement pattern with regex=False"
1560 )
1561 elif callable(repl):
1562 raise ValueError("Cannot use a callable replacement when regex=False")
1563
1564 if case is None:
1565 case = True
1566
1567 result = self._data.array._str_replace(
1568 pat, repl, n=n, case=case, flags=flags, regex=regex
1569 )
1570 return self._wrap_result(result)
1571
1572 @forbid_nonstring_types(["bytes"])
1573 def repeat(self, repeats):
1574 """
1575 Duplicate each string in the Series or Index.
1576
1577 Parameters
1578 ----------
1579 repeats : int or sequence of int
1580 Same value for all (int) or different value per (sequence).
1581
1582 Returns
1583 -------
1584 Series or pandas.Index
1585 Series or Index of repeated string objects specified by
1586 input parameter repeats.
1587
1588 Examples
1589 --------
1590 >>> s = pd.Series(['a', 'b', 'c'])
1591 >>> s
1592 0 a
1593 1 b
1594 2 c
1595 dtype: object
1596
1597 Single int repeats string in Series
1598
1599 >>> s.str.repeat(repeats=2)
1600 0 aa
1601 1 bb
1602 2 cc
1603 dtype: object
1604
1605 Sequence of int repeats corresponding string in Series
1606
1607 >>> s.str.repeat(repeats=[1, 2, 3])
1608 0 a
1609 1 bb
1610 2 ccc
1611 dtype: object
1612 """
1613 result = self._data.array._str_repeat(repeats)
1614 return self._wrap_result(result)
1615
1616 @forbid_nonstring_types(["bytes"])
1617 def pad(
1618 self,
1619 width: int,
1620 side: Literal["left", "right", "both"] = "left",
1621 fillchar: str = " ",
1622 ):
1623 """
1624 Pad strings in the Series/Index up to width.
1625
1626 Parameters
1627 ----------
1628 width : int
1629 Minimum width of resulting string; additional characters will be filled
1630 with character defined in `fillchar`.
1631 side : {'left', 'right', 'both'}, default 'left'
1632 Side from which to fill resulting string.
1633 fillchar : str, default ' '
1634 Additional character for filling, default is whitespace.
1635
1636 Returns
1637 -------
1638 Series or Index of object
1639 Returns Series or Index with minimum number of char in object.
1640
1641 See Also
1642 --------
1643 Series.str.rjust : Fills the left side of strings with an arbitrary
1644 character. Equivalent to ``Series.str.pad(side='left')``.
1645 Series.str.ljust : Fills the right side of strings with an arbitrary
1646 character. Equivalent to ``Series.str.pad(side='right')``.
1647 Series.str.center : Fills both sides of strings with an arbitrary
1648 character. Equivalent to ``Series.str.pad(side='both')``.
1649 Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1650 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
1651
1652 Examples
1653 --------
1654 >>> s = pd.Series(["caribou", "tiger"])
1655 >>> s
1656 0 caribou
1657 1 tiger
1658 dtype: object
1659
1660 >>> s.str.pad(width=10)
1661 0 caribou
1662 1 tiger
1663 dtype: object
1664
1665 >>> s.str.pad(width=10, side='right', fillchar='-')
1666 0 caribou---
1667 1 tiger-----
1668 dtype: object
1669
1670 >>> s.str.pad(width=10, side='both', fillchar='-')
1671 0 -caribou--
1672 1 --tiger---
1673 dtype: object
1674 """
1675 if not isinstance(fillchar, str):
1676 msg = f"fillchar must be a character, not {type(fillchar).__name__}"
1677 raise TypeError(msg)
1678
1679 if len(fillchar) != 1:
1680 raise TypeError("fillchar must be a character, not str")
1681
1682 if not is_integer(width):
1683 msg = f"width must be of integer type, not {type(width).__name__}"
1684 raise TypeError(msg)
1685
1686 result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
1687 return self._wrap_result(result)
1688
    # Shared docstring template for the pad-family convenience methods
    # (center/ljust/rjust); rendered via @Appender with the "side" and
    # "method" placeholders filled in per method.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series/Index of objects.

    Examples
    --------
    For Series.str.center:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.center(8, fillchar='.')
    0   ..dog...
    1   ..bird..
    2   .mouse..
    dtype: object

    For Series.str.ljust:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.ljust(8, fillchar='.')
    0   dog.....
    1   bird....
    2   mouse...
    dtype: object

    For Series.str.rjust:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.rjust(8, fillchar='.')
    0   .....dog
    1   ....bird
    2   ...mouse
    dtype: object
    """
1737
1738 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
1739 @forbid_nonstring_types(["bytes"])
1740 def center(self, width: int, fillchar: str = " "):
1741 return self.pad(width, side="both", fillchar=fillchar)
1742
1743 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
1744 @forbid_nonstring_types(["bytes"])
1745 def ljust(self, width: int, fillchar: str = " "):
1746 return self.pad(width, side="right", fillchar=fillchar)
1747
1748 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
1749 @forbid_nonstring_types(["bytes"])
1750 def rjust(self, width: int, fillchar: str = " "):
1751 return self.pad(width, side="left", fillchar=fillchar)
1752
1753 @forbid_nonstring_types(["bytes"])
1754 def zfill(self, width: int):
1755 """
1756 Pad strings in the Series/Index by prepending '0' characters.
1757
1758 Strings in the Series/Index are padded with '0' characters on the
1759 left of the string to reach a total string length `width`. Strings
1760 in the Series/Index with length greater or equal to `width` are
1761 unchanged.
1762
1763 Parameters
1764 ----------
1765 width : int
1766 Minimum length of resulting string; strings with length less
1767 than `width` be prepended with '0' characters.
1768
1769 Returns
1770 -------
1771 Series/Index of objects.
1772
1773 See Also
1774 --------
1775 Series.str.rjust : Fills the left side of strings with an arbitrary
1776 character.
1777 Series.str.ljust : Fills the right side of strings with an arbitrary
1778 character.
1779 Series.str.pad : Fills the specified sides of strings with an arbitrary
1780 character.
1781 Series.str.center : Fills both sides of strings with an arbitrary
1782 character.
1783
1784 Notes
1785 -----
1786 Differs from :meth:`str.zfill` which has special handling
1787 for '+'/'-' in the string.
1788
1789 Examples
1790 --------
1791 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
1792 >>> s
1793 0 -1
1794 1 1
1795 2 1000
1796 3 10
1797 4 NaN
1798 dtype: object
1799
1800 Note that ``10`` and ``NaN`` are not strings, therefore they are
1801 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
1802 special character and the zero is added to the right of it
1803 (:meth:`str.zfill` would have moved it to the left). ``1000``
1804 remains unchanged as it is longer than `width`.
1805
1806 >>> s.str.zfill(3)
1807 0 -01
1808 1 001
1809 2 1000
1810 3 NaN
1811 4 NaN
1812 dtype: object
1813 """
1814 if not is_integer(width):
1815 msg = f"width must be of integer type, not {type(width).__name__}"
1816 raise TypeError(msg)
1817 f = lambda x: x.zfill(width)
1818 result = self._data.array._str_map(f)
1819 return self._wrap_result(result)
1820
1821 def slice(self, start=None, stop=None, step=None):
1822 """
1823 Slice substrings from each element in the Series or Index.
1824
1825 Parameters
1826 ----------
1827 start : int, optional
1828 Start position for slice operation.
1829 stop : int, optional
1830 Stop position for slice operation.
1831 step : int, optional
1832 Step size for slice operation.
1833
1834 Returns
1835 -------
1836 Series or Index of object
1837 Series or Index from sliced substring from original string object.
1838
1839 See Also
1840 --------
1841 Series.str.slice_replace : Replace a slice with a string.
1842 Series.str.get : Return element at position.
1843 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
1844 being the position.
1845
1846 Examples
1847 --------
1848 >>> s = pd.Series(["koala", "dog", "chameleon"])
1849 >>> s
1850 0 koala
1851 1 dog
1852 2 chameleon
1853 dtype: object
1854
1855 >>> s.str.slice(start=1)
1856 0 oala
1857 1 og
1858 2 hameleon
1859 dtype: object
1860
1861 >>> s.str.slice(start=-1)
1862 0 a
1863 1 g
1864 2 n
1865 dtype: object
1866
1867 >>> s.str.slice(stop=2)
1868 0 ko
1869 1 do
1870 2 ch
1871 dtype: object
1872
1873 >>> s.str.slice(step=2)
1874 0 kaa
1875 1 dg
1876 2 caeen
1877 dtype: object
1878
1879 >>> s.str.slice(start=0, stop=5, step=3)
1880 0 kl
1881 1 d
1882 2 cm
1883 dtype: object
1884
1885 Equivalent behaviour to:
1886
1887 >>> s.str[0:5:3]
1888 0 kl
1889 1 d
1890 2 cm
1891 dtype: object
1892 """
1893 result = self._data.array._str_slice(start, stop, step)
1894 return self._wrap_result(result)
1895
1896 @forbid_nonstring_types(["bytes"])
1897 def slice_replace(self, start=None, stop=None, repl=None):
1898 """
1899 Replace a positional slice of a string with another value.
1900
1901 Parameters
1902 ----------
1903 start : int, optional
1904 Left index position to use for the slice. If not specified (None),
1905 the slice is unbounded on the left, i.e. slice from the start
1906 of the string.
1907 stop : int, optional
1908 Right index position to use for the slice. If not specified (None),
1909 the slice is unbounded on the right, i.e. slice until the
1910 end of the string.
1911 repl : str, optional
1912 String for replacement. If not specified (None), the sliced region
1913 is replaced with an empty string.
1914
1915 Returns
1916 -------
1917 Series or Index
1918 Same type as the original object.
1919
1920 See Also
1921 --------
1922 Series.str.slice : Just slicing without replacement.
1923
1924 Examples
1925 --------
1926 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
1927 >>> s
1928 0 a
1929 1 ab
1930 2 abc
1931 3 abdc
1932 4 abcde
1933 dtype: object
1934
1935 Specify just `start`, meaning replace `start` until the end of the
1936 string with `repl`.
1937
1938 >>> s.str.slice_replace(1, repl='X')
1939 0 aX
1940 1 aX
1941 2 aX
1942 3 aX
1943 4 aX
1944 dtype: object
1945
1946 Specify just `stop`, meaning the start of the string to `stop` is replaced
1947 with `repl`, and the rest of the string is included.
1948
1949 >>> s.str.slice_replace(stop=2, repl='X')
1950 0 X
1951 1 X
1952 2 Xc
1953 3 Xdc
1954 4 Xcde
1955 dtype: object
1956
1957 Specify `start` and `stop`, meaning the slice from `start` to `stop` is
1958 replaced with `repl`. Everything before or after `start` and `stop` is
1959 included as is.
1960
1961 >>> s.str.slice_replace(start=1, stop=3, repl='X')
1962 0 aX
1963 1 aX
1964 2 aX
1965 3 aXc
1966 4 aXde
1967 dtype: object
1968 """
1969 result = self._data.array._str_slice_replace(start, stop, repl)
1970 return self._wrap_result(result)
1971
1972 def decode(self, encoding, errors: str = "strict"):
1973 """
1974 Decode character string in the Series/Index using indicated encoding.
1975
1976 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
1977 python3.
1978
1979 Parameters
1980 ----------
1981 encoding : str
1982 errors : str, optional
1983
1984 Returns
1985 -------
1986 Series or Index
1987
1988 Examples
1989 --------
1990 For Series:
1991
1992 >>> ser = pd.Series([b'cow', b'123', b'()'])
1993 >>> ser.str.decode('ascii')
1994 0 cow
1995 1 123
1996 2 ()
1997 dtype: object
1998 """
1999 # TODO: Add a similar _bytes interface.
2000 if encoding in _cpython_optimized_decoders:
2001 # CPython optimized implementation
2002 f = lambda x: x.decode(encoding, errors)
2003 else:
2004 decoder = codecs.getdecoder(encoding)
2005 f = lambda x: decoder(x, errors)[0]
2006 arr = self._data.array
2007 # assert isinstance(arr, (StringArray,))
2008 result = arr._str_map(f)
2009 return self._wrap_result(result)
2010
2011 @forbid_nonstring_types(["bytes"])
2012 def encode(self, encoding, errors: str = "strict"):
2013 """
2014 Encode character string in the Series/Index using indicated encoding.
2015
2016 Equivalent to :meth:`str.encode`.
2017
2018 Parameters
2019 ----------
2020 encoding : str
2021 errors : str, optional
2022
2023 Returns
2024 -------
2025 Series/Index of objects
2026
2027 Examples
2028 --------
2029 >>> ser = pd.Series(['cow', '123', '()'])
2030 >>> ser.str.encode(encoding='ascii')
2031 0 b'cow'
2032 1 b'123'
2033 2 b'()'
2034 dtype: object
2035 """
2036 result = self._data.array._str_encode(encoding, errors)
2037 return self._wrap_result(result, returns_string=False)
2038
    # Shared docstring template for strip/lstrip/rstrip; rendered via
    # @Appender with the "position", "side" and "method" placeholders
    # filled in per method.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """
2114
2115 @Appender(
2116 _shared_docs["str_strip"]
2117 % {
2118 "side": "left and right sides",
2119 "method": "strip",
2120 "position": "leading and trailing",
2121 }
2122 )
2123 @forbid_nonstring_types(["bytes"])
2124 def strip(self, to_strip=None):
2125 result = self._data.array._str_strip(to_strip)
2126 return self._wrap_result(result)
2127
2128 @Appender(
2129 _shared_docs["str_strip"]
2130 % {"side": "left side", "method": "lstrip", "position": "leading"}
2131 )
2132 @forbid_nonstring_types(["bytes"])
2133 def lstrip(self, to_strip=None):
2134 result = self._data.array._str_lstrip(to_strip)
2135 return self._wrap_result(result)
2136
2137 @Appender(
2138 _shared_docs["str_strip"]
2139 % {"side": "right side", "method": "rstrip", "position": "trailing"}
2140 )
2141 @forbid_nonstring_types(["bytes"])
2142 def rstrip(self, to_strip=None):
2143 result = self._data.array._str_rstrip(to_strip)
2144 return self._wrap_result(result)
2145
    # Shared docstring template for removeprefix/removesuffix; rendered via
    # @Appender with the "side" and "other_side" placeholders filled in
    # per method.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """
2193
2194 @Appender(
2195 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
2196 )
2197 @forbid_nonstring_types(["bytes"])
2198 def removeprefix(self, prefix: str):
2199 result = self._data.array._str_removeprefix(prefix)
2200 return self._wrap_result(result)
2201
2202 @Appender(
2203 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
2204 )
2205 @forbid_nonstring_types(["bytes"])
2206 def removesuffix(self, suffix: str):
2207 result = self._data.array._str_removesuffix(suffix)
2208 return self._wrap_result(result)
2209
2210 @forbid_nonstring_types(["bytes"])
2211 def wrap(self, width: int, **kwargs):
2212 r"""
2213 Wrap strings in Series/Index at specified line width.
2214
2215 This method has the same keyword parameters and defaults as
2216 :class:`textwrap.TextWrapper`.
2217
2218 Parameters
2219 ----------
2220 width : int
2221 Maximum line width.
2222 expand_tabs : bool, optional
2223 If True, tab characters will be expanded to spaces (default: True).
2224 replace_whitespace : bool, optional
2225 If True, each whitespace character (as defined by string.whitespace)
2226 remaining after tab expansion will be replaced by a single space
2227 (default: True).
2228 drop_whitespace : bool, optional
2229 If True, whitespace that, after wrapping, happens to end up at the
2230 beginning or end of a line is dropped (default: True).
2231 break_long_words : bool, optional
2232 If True, then words longer than width will be broken in order to ensure
2233 that no lines are longer than width. If it is false, long words will
2234 not be broken, and some lines may be longer than width (default: True).
2235 break_on_hyphens : bool, optional
2236 If True, wrapping will occur preferably on whitespace and right after
2237 hyphens in compound words, as it is customary in English. If false,
2238 only whitespaces will be considered as potentially good places for line
2239 breaks, but you need to set break_long_words to false if you want truly
2240 insecable words (default: True).
2241
2242 Returns
2243 -------
2244 Series or Index
2245
2246 Notes
2247 -----
2248 Internally, this method uses a :class:`textwrap.TextWrapper` instance with
2249 default settings. To achieve behavior matching R's stringr library str_wrap
2250 function, use the arguments:
2251
2252 - expand_tabs = False
2253 - replace_whitespace = True
2254 - drop_whitespace = True
2255 - break_long_words = False
2256 - break_on_hyphens = False
2257
2258 Examples
2259 --------
2260 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
2261 >>> s.str.wrap(12)
2262 0 line to be\nwrapped
2263 1 another line\nto be\nwrapped
2264 dtype: object
2265 """
2266 result = self._data.array._str_wrap(width, **kwargs)
2267 return self._wrap_result(result)
2268
    @forbid_nonstring_types(["bytes"])
    def get_dummies(self, sep: str = "|"):
        """
        Return DataFrame of dummy/indicator variables for Series.

        Each string in Series is split by sep and returned as a DataFrame
        of dummy/indicator variables.

        Parameters
        ----------
        sep : str, default "|"
            String to split on.

        Returns
        -------
        DataFrame
            Dummy variables corresponding to values of the Series.

        See Also
        --------
        get_dummies : Convert categorical variable into dummy/indicator
            variables.

        Examples
        --------
        >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
           a  b  c
        0  1  1  0
        1  1  0  0
        2  1  0  1

        >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
           a  b  c
        0  1  1  0
        1  0  0  0
        2  1  0  1
        """
        # The array implementation returns the indicator matrix together with
        # the labels for the split values; with ``expand=True`` those labels
        # become the column labels of the wrapped DataFrame.
        result, name = self._data.array._str_get_dummies(sep)
        return self._wrap_result(
            result,
            name=name,
            expand=True,
            returns_string=False,
        )
2315
2316 @forbid_nonstring_types(["bytes"])
2317 def translate(self, table):
2318 """
2319 Map all characters in the string through the given mapping table.
2320
2321 Equivalent to standard :meth:`str.translate`.
2322
2323 Parameters
2324 ----------
2325 table : dict
2326 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
2327 None. Unmapped characters are left untouched.
2328 Characters mapped to None are deleted. :meth:`str.maketrans` is a
2329 helper function for making translation tables.
2330
2331 Returns
2332 -------
2333 Series or Index
2334
2335 Examples
2336 --------
2337 >>> ser = pd.Series(["El niño", "Françoise"])
2338 >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'})
2339 >>> ser.str.translate(mytable)
2340 0 El nino
2341 1 Francoise
2342 dtype: object
2343 """
2344 result = self._data.array._str_translate(table)
2345 dtype = object if self._data.dtype == "object" else None
2346 return self._wrap_result(result, dtype=dtype)
2347
    @forbid_nonstring_types(["bytes"])
    def count(self, pat, flags: int = 0):
        r"""
        Count occurrences of pattern in each string of the Series/Index.

        This function is used to count the number of times a particular regex
        pattern is repeated in each of the string elements of the
        :class:`~pandas.Series`.

        Parameters
        ----------
        pat : str
            Valid regular expression.
        flags : int, default 0, meaning no flags
            Flags for the `re` module. For a complete list, `see here
            <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.

        Returns
        -------
        Series or Index
            Same type as the calling object containing the integer counts.

        See Also
        --------
        re : Standard library module for regular expressions.
        str.count : Standard library version, without regular expression support.

        Notes
        -----
        Some characters need to be escaped when passing in `pat`.
        eg. ``'$'`` has a special meaning in regex and must be escaped when
        finding this literal character.

        Examples
        --------
        >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
        >>> s.str.count('a')
        0    0.0
        1    0.0
        2    2.0
        3    2.0
        4    NaN
        5    0.0
        6    1.0
        dtype: float64

        Escape ``'$'`` to find the literal dollar sign.

        >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
        >>> s.str.count('\\$')
        0    1
        1    0
        2    1
        3    2
        4    2
        5    0
        dtype: int64

        This is also available on Index

        >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
        Index([0, 0, 2, 1], dtype='int64')
        """
        result = self._data.array._str_count(pat, flags)
        return self._wrap_result(result, returns_string=False)
2415
2416 @forbid_nonstring_types(["bytes"])
2417 def startswith(
2418 self, pat: str | tuple[str, ...], na: Scalar | None = None
2419 ) -> Series | Index:
2420 """
2421 Test if the start of each string element matches a pattern.
2422
2423 Equivalent to :meth:`str.startswith`.
2424
2425 Parameters
2426 ----------
2427 pat : str or tuple[str, ...]
2428 Character sequence or tuple of strings. Regular expressions are not
2429 accepted.
2430 na : object, default NaN
2431 Object shown if element tested is not a string. The default depends
2432 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2433 For ``StringDtype``, ``pandas.NA`` is used.
2434
2435 Returns
2436 -------
2437 Series or Index of bool
2438 A Series of booleans indicating whether the given pattern matches
2439 the start of each string element.
2440
2441 See Also
2442 --------
2443 str.startswith : Python standard library string method.
2444 Series.str.endswith : Same as startswith, but tests the end of string.
2445 Series.str.contains : Tests if string element contains a pattern.
2446
2447 Examples
2448 --------
2449 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
2450 >>> s
2451 0 bat
2452 1 Bear
2453 2 cat
2454 3 NaN
2455 dtype: object
2456
2457 >>> s.str.startswith('b')
2458 0 True
2459 1 False
2460 2 False
2461 3 NaN
2462 dtype: object
2463
2464 >>> s.str.startswith(('b', 'B'))
2465 0 True
2466 1 True
2467 2 False
2468 3 NaN
2469 dtype: object
2470
2471 Specifying `na` to be `False` instead of `NaN`.
2472
2473 >>> s.str.startswith('b', na=False)
2474 0 True
2475 1 False
2476 2 False
2477 3 False
2478 dtype: bool
2479 """
2480 if not isinstance(pat, (str, tuple)):
2481 msg = f"expected a string or tuple, not {type(pat).__name__}"
2482 raise TypeError(msg)
2483 result = self._data.array._str_startswith(pat, na=na)
2484 return self._wrap_result(result, returns_string=False)
2485
2486 @forbid_nonstring_types(["bytes"])
2487 def endswith(
2488 self, pat: str | tuple[str, ...], na: Scalar | None = None
2489 ) -> Series | Index:
2490 """
2491 Test if the end of each string element matches a pattern.
2492
2493 Equivalent to :meth:`str.endswith`.
2494
2495 Parameters
2496 ----------
2497 pat : str or tuple[str, ...]
2498 Character sequence or tuple of strings. Regular expressions are not
2499 accepted.
2500 na : object, default NaN
2501 Object shown if element tested is not a string. The default depends
2502 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2503 For ``StringDtype``, ``pandas.NA`` is used.
2504
2505 Returns
2506 -------
2507 Series or Index of bool
2508 A Series of booleans indicating whether the given pattern matches
2509 the end of each string element.
2510
2511 See Also
2512 --------
2513 str.endswith : Python standard library string method.
2514 Series.str.startswith : Same as endswith, but tests the start of string.
2515 Series.str.contains : Tests if string element contains a pattern.
2516
2517 Examples
2518 --------
2519 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
2520 >>> s
2521 0 bat
2522 1 bear
2523 2 caT
2524 3 NaN
2525 dtype: object
2526
2527 >>> s.str.endswith('t')
2528 0 True
2529 1 False
2530 2 False
2531 3 NaN
2532 dtype: object
2533
2534 >>> s.str.endswith(('t', 'T'))
2535 0 True
2536 1 False
2537 2 True
2538 3 NaN
2539 dtype: object
2540
2541 Specifying `na` to be `False` instead of `NaN`.
2542
2543 >>> s.str.endswith('t', na=False)
2544 0 True
2545 1 False
2546 2 False
2547 3 False
2548 dtype: bool
2549 """
2550 if not isinstance(pat, (str, tuple)):
2551 msg = f"expected a string or tuple, not {type(pat).__name__}"
2552 raise TypeError(msg)
2553 result = self._data.array._str_endswith(pat, na=na)
2554 return self._wrap_result(result, returns_string=False)
2555
2556 @forbid_nonstring_types(["bytes"])
2557 def findall(self, pat, flags: int = 0):
2558 """
2559 Find all occurrences of pattern or regular expression in the Series/Index.
2560
2561 Equivalent to applying :func:`re.findall` to all the elements in the
2562 Series/Index.
2563
2564 Parameters
2565 ----------
2566 pat : str
2567 Pattern or regular expression.
2568 flags : int, default 0
2569 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
2570 means no flags).
2571
2572 Returns
2573 -------
2574 Series/Index of lists of strings
2575 All non-overlapping matches of pattern or regular expression in each
2576 string of this Series/Index.
2577
2578 See Also
2579 --------
2580 count : Count occurrences of pattern or regular expression in each string
2581 of the Series/Index.
2582 extractall : For each string in the Series, extract groups from all matches
2583 of regular expression and return a DataFrame with one row for each
2584 match and one column for each group.
2585 re.findall : The equivalent ``re`` function to all non-overlapping matches
2586 of pattern or regular expression in string, as a list of strings.
2587
2588 Examples
2589 --------
2590 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
2591
2592 The search for the pattern 'Monkey' returns one match:
2593
2594 >>> s.str.findall('Monkey')
2595 0 []
2596 1 [Monkey]
2597 2 []
2598 dtype: object
2599
2600 On the other hand, the search for the pattern 'MONKEY' doesn't return any
2601 match:
2602
2603 >>> s.str.findall('MONKEY')
2604 0 []
2605 1 []
2606 2 []
2607 dtype: object
2608
2609 Flags can be added to the pattern or regular expression. For instance,
2610 to find the pattern 'MONKEY' ignoring the case:
2611
2612 >>> import re
2613 >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
2614 0 []
2615 1 [Monkey]
2616 2 []
2617 dtype: object
2618
2619 When the pattern matches more than one string in the Series, all matches
2620 are returned:
2621
2622 >>> s.str.findall('on')
2623 0 [on]
2624 1 [on]
2625 2 []
2626 dtype: object
2627
2628 Regular expressions are supported too. For instance, the search for all the
2629 strings ending with the word 'on' is shown next:
2630
2631 >>> s.str.findall('on$')
2632 0 [on]
2633 1 []
2634 2 []
2635 dtype: object
2636
2637 If the pattern is found more than once in the same string, then a list of
2638 multiple strings is returned:
2639
2640 >>> s.str.findall('b')
2641 0 []
2642 1 []
2643 2 [b, b]
2644 dtype: object
2645 """
2646 result = self._data.array._str_findall(pat, flags)
2647 return self._wrap_result(result, returns_string=False)
2648
    @forbid_nonstring_types(["bytes"])
    def extract(
        self, pat: str, flags: int = 0, expand: bool = True
    ) -> DataFrame | Series | Index:
        r"""
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the
        first match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
            modify regular expression matching for things like case,
            spaces, etc. For more details, see :mod:`re`.
        expand : bool, default True
            If True, return DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group
            or DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series or Index
            A DataFrame with one row for each subject string, and one
            column for each group. Any capture group names in regular
            expression pat will be used for column names; otherwise
            capture group numbers will be used. The dtype of each result
            column is always object, even when no match is found. If
            ``expand=False`` and pat has only one capture group, then
            return a Series (if subject is a Series) or Index (if subject
            is an Index).

        See Also
        --------
        extractall : Returns all matches (not just the first match).

        Examples
        --------
        A pattern with two groups will return a DataFrame with two columns.
        Non-matches will be NaN.

        >>> s = pd.Series(['a1', 'b2', 'c3'])
        >>> s.str.extract(r'([ab])(\d)')
             0    1
        0    a    1
        1    b    2
        2  NaN  NaN

        A pattern may contain optional groups.

        >>> s.str.extract(r'([ab])?(\d)')
             0  1
        0    a  1
        1    b  2
        2  NaN  3

        Named groups will become column names in the result.

        >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
          letter digit
        0      a     1
        1      b     2
        2    NaN   NaN

        A pattern with one group will return a DataFrame with one column
        if expand=True.

        >>> s.str.extract(r'[ab](\d)', expand=True)
             0
        0    1
        1    2
        2  NaN

        A pattern with one group will return a Series if expand=False.

        >>> s.str.extract(r'[ab](\d)', expand=False)
        0      1
        1      2
        2    NaN
        dtype: object
        """
        from pandas import DataFrame

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        # Compile eagerly so we can validate the pattern and inspect its
        # capture groups before dispatching.
        regex = re.compile(pat, flags=flags)
        if regex.groups == 0:
            raise ValueError("pattern contains no capture groups")

        # An Index cannot hold the 2-D result that multiple groups would
        # require when expand=False.
        if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
            raise ValueError("only one regex group is supported with Index")

        obj = self._data
        result_dtype = _result_dtype(obj)

        # A DataFrame is produced when there are multiple capture groups or
        # the caller explicitly requested expansion.
        returns_df = regex.groups > 1 or expand

        if returns_df:
            name = None
            columns = _get_group_names(regex)

            if obj.array.size == 0:
                # Nothing to match against: build an empty frame with the
                # expected columns and dtype.
                result = DataFrame(columns=columns, dtype=result_dtype)

            else:
                result_list = self._data.array._str_extract(
                    pat, flags=flags, expand=returns_df
                )

                # Preserve the Series' index; an Index input gets a default
                # RangeIndex on the resulting frame.
                result_index: Index | None
                if isinstance(obj, ABCSeries):
                    result_index = obj.index
                else:
                    result_index = None

                result = DataFrame(
                    result_list, columns=columns, index=result_index, dtype=result_dtype
                )

        else:
            # Single group, expand=False: wrap as Series/Index, named after
            # the group if the pattern named it.
            name = _get_single_group_name(regex)
            result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
        return self._wrap_result(result, name=name, dtype=result_dtype)
2776
    @forbid_nonstring_types(["bytes"])
    def extractall(self, pat, flags: int = 0) -> DataFrame:
        r"""
        Extract capture groups in the regex `pat` as columns in DataFrame.

        For each subject string in the Series, extract groups from all
        matches of regular expression pat. When each subject string in the
        Series has exactly one match, extractall(pat).xs(0, level='match')
        is the same as extract(pat).

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            A ``re`` module flag, for example ``re.IGNORECASE``. These allow
            to modify regular expression matching for things like case, spaces,
            etc. Multiple flags can be combined with the bitwise OR operator,
            for example ``re.IGNORECASE | re.MULTILINE``.

        Returns
        -------
        DataFrame
            A ``DataFrame`` with one row for each match, and one column for each
            group. Its rows have a ``MultiIndex`` with first levels that come from
            the subject ``Series``. The last level is named 'match' and indexes the
            matches in each item of the ``Series``. Any capture group names in
            regular expression pat will be used for column names; otherwise capture
            group numbers will be used.

        See Also
        --------
        extract : Returns first match only (not all matches).

        Examples
        --------
        A pattern with one group will return a DataFrame with one column.
        Indices with no matches will not appear in the result.

        >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
        >>> s.str.extractall(r"[ab](\d)")
                 0
          match
        A 0      1
          1      2
        B 0      1

        Capture group names are used for column names of the result.

        >>> s.str.extractall(r"[ab](?P<digit>\d)")
                digit
          match
        A 0         1
          1         2
        B 0         1

        A pattern with two groups will return a DataFrame with two columns.

        >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
                letter digit
          match
        A 0          a     1
          1          a     2
        B 0          b     1

        Optional groups that do not match are NaN in the result.

        >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
                letter digit
          match
        A 0          a     1
          1          a     2
        B 0          b     1
        C 0        NaN     1
        """
        # TODO: dispatch
        # Unlike most methods here, this delegates to the module-level
        # ``str_extractall`` helper on the original Series/Index rather than
        # to the backing array.
        return str_extractall(self._orig, pat, flags)
2854
    # Shared docstring template for find/rfind; %-formatted with per-method
    # values ("side", "method", "also") at the @Appender call sites below.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s

    Examples
    --------
    For Series.str.find:

    >>> ser = pd.Series(["cow_", "duck_", "do_ve"])
    >>> ser.str.find("_")
    0   3
    1   4
    2   2
    dtype: int64

    For Series.str.rfind:

    >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"])
    >>> ser.str.rfind("_")
    0   4
    1   4
    2   4
    dtype: int64
    """
2901
2902 @Appender(
2903 _shared_docs["find"]
2904 % {
2905 "side": "lowest",
2906 "method": "find",
2907 "also": "rfind : Return highest indexes in each strings.",
2908 }
2909 )
2910 @forbid_nonstring_types(["bytes"])
2911 def find(self, sub, start: int = 0, end=None):
2912 if not isinstance(sub, str):
2913 msg = f"expected a string object, not {type(sub).__name__}"
2914 raise TypeError(msg)
2915
2916 result = self._data.array._str_find(sub, start, end)
2917 return self._wrap_result(result, returns_string=False)
2918
2919 @Appender(
2920 _shared_docs["find"]
2921 % {
2922 "side": "highest",
2923 "method": "rfind",
2924 "also": "find : Return lowest indexes in each strings.",
2925 }
2926 )
2927 @forbid_nonstring_types(["bytes"])
2928 def rfind(self, sub, start: int = 0, end=None):
2929 if not isinstance(sub, str):
2930 msg = f"expected a string object, not {type(sub).__name__}"
2931 raise TypeError(msg)
2932
2933 result = self._data.array._str_rfind(sub, start=start, end=end)
2934 return self._wrap_result(result, returns_string=False)
2935
2936 @forbid_nonstring_types(["bytes"])
2937 def normalize(self, form):
2938 """
2939 Return the Unicode normal form for the strings in the Series/Index.
2940
2941 For more information on the forms, see the
2942 :func:`unicodedata.normalize`.
2943
2944 Parameters
2945 ----------
2946 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2947 Unicode form.
2948
2949 Returns
2950 -------
2951 Series/Index of objects
2952
2953 Examples
2954 --------
2955 >>> ser = pd.Series(['ñ'])
2956 >>> ser.str.normalize('NFC') == ser.str.normalize('NFD')
2957 0 False
2958 dtype: bool
2959 """
2960 result = self._data.array._str_normalize(form)
2961 return self._wrap_result(result)
2962
    # Shared docstring template for index/rindex; %-formatted with per-method
    # values ("side", "similar", "method", "also") at the @Appender call sites.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s

    Examples
    --------
    For Series.str.index:

    >>> ser = pd.Series(["horse", "eagle", "donkey"])
    >>> ser.str.index("e")
    0   4
    1   0
    2   4
    dtype: int64

    For Series.str.rindex:

    >>> ser = pd.Series(["Deer", "eagle", "Sheep"])
    >>> ser.str.rindex("e")
    0   2
    1   4
    2   3
    dtype: int64
    """
3011
3012 @Appender(
3013 _shared_docs["index"]
3014 % {
3015 "side": "lowest",
3016 "similar": "find",
3017 "method": "index",
3018 "also": "rindex : Return highest indexes in each strings.",
3019 }
3020 )
3021 @forbid_nonstring_types(["bytes"])
3022 def index(self, sub, start: int = 0, end=None):
3023 if not isinstance(sub, str):
3024 msg = f"expected a string object, not {type(sub).__name__}"
3025 raise TypeError(msg)
3026
3027 result = self._data.array._str_index(sub, start=start, end=end)
3028 return self._wrap_result(result, returns_string=False)
3029
3030 @Appender(
3031 _shared_docs["index"]
3032 % {
3033 "side": "highest",
3034 "similar": "rfind",
3035 "method": "rindex",
3036 "also": "index : Return lowest indexes in each strings.",
3037 }
3038 )
3039 @forbid_nonstring_types(["bytes"])
3040 def rindex(self, sub, start: int = 0, end=None):
3041 if not isinstance(sub, str):
3042 msg = f"expected a string object, not {type(sub).__name__}"
3043 raise TypeError(msg)
3044
3045 result = self._data.array._str_rindex(sub, start=start, end=end)
3046 return self._wrap_result(result, returns_string=False)
3047
3048 def len(self):
3049 """
3050 Compute the length of each element in the Series/Index.
3051
3052 The element may be a sequence (such as a string, tuple or list) or a collection
3053 (such as a dictionary).
3054
3055 Returns
3056 -------
3057 Series or Index of int
3058 A Series or Index of integer values indicating the length of each
3059 element in the Series or Index.
3060
3061 See Also
3062 --------
3063 str.len : Python built-in function returning the length of an object.
3064 Series.size : Returns the length of the Series.
3065
3066 Examples
3067 --------
3068 Returns the length (number of characters) in a string. Returns the
3069 number of entries for dictionaries, lists or tuples.
3070
3071 >>> s = pd.Series(['dog',
3072 ... '',
3073 ... 5,
3074 ... {'foo' : 'bar'},
3075 ... [2, 3, 5, 7],
3076 ... ('one', 'two', 'three')])
3077 >>> s
3078 0 dog
3079 1
3080 2 5
3081 3 {'foo': 'bar'}
3082 4 [2, 3, 5, 7]
3083 5 (one, two, three)
3084 dtype: object
3085 >>> s.str.len()
3086 0 3.0
3087 1 0.0
3088 2 NaN
3089 3 1.0
3090 4 4.0
3091 5 3.0
3092 dtype: float64
3093 """
3094 result = self._data.array._str_len()
3095 return self._wrap_result(result, returns_string=False)
3096
    # Shared docstring template for the case-conversion methods below;
    # %-formatted with the per-method entries stored in ``_doc_args``.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    #   cases:
    #       upper, lower, title, capitalize, swapcase, casefold
    #   boolean:
    #       isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs;
    # each entry fills the %(type)s / %(method)s / %(version)s placeholders of
    # the shared "casemethods" template.
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "",
    }
3190
3191 @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
3192 @forbid_nonstring_types(["bytes"])
3193 def lower(self):
3194 result = self._data.array._str_lower()
3195 return self._wrap_result(result)
3196
3197 @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
3198 @forbid_nonstring_types(["bytes"])
3199 def upper(self):
3200 result = self._data.array._str_upper()
3201 return self._wrap_result(result)
3202
3203 @Appender(_shared_docs["casemethods"] % _doc_args["title"])
3204 @forbid_nonstring_types(["bytes"])
3205 def title(self):
3206 result = self._data.array._str_title()
3207 return self._wrap_result(result)
3208
3209 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
3210 @forbid_nonstring_types(["bytes"])
3211 def capitalize(self):
3212 result = self._data.array._str_capitalize()
3213 return self._wrap_result(result)
3214
3215 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
3216 @forbid_nonstring_types(["bytes"])
3217 def swapcase(self):
3218 result = self._data.array._str_swapcase()
3219 return self._wrap_result(result)
3220
3221 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
3222 @forbid_nonstring_types(["bytes"])
3223 def casefold(self):
3224 result = self._data.array._str_casefold()
3225 return self._wrap_result(result)
3226
    # Shared docstring template for the is* predicate methods; %-formatted
    # with the per-method entries stored in ``_doc_args`` below.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    # Template substitutions for the shared "ismethods" docstring above.
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    # Each predicate is generated by _map_and_wrap, which dispatches to the
    # corresponding _str_is* array method and attaches the formatted docstring.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )
3407
3408
3409def cat_safe(list_of_columns: list[npt.NDArray[np.object_]], sep: str):
3410 """
3411 Auxiliary function for :meth:`str.cat`.
3412
3413 Same signature as cat_core, but handles TypeErrors in concatenation, which
3414 happen if the arrays in list_of columns have the wrong dtypes or content.
3415
3416 Parameters
3417 ----------
3418 list_of_columns : list of numpy arrays
3419 List of arrays to be concatenated with sep;
3420 these arrays may not contain NaNs!
3421 sep : string
3422 The separator string for concatenating the columns.
3423
3424 Returns
3425 -------
3426 nd.array
3427 The concatenation of list_of_columns with sep.
3428 """
3429 try:
3430 result = cat_core(list_of_columns, sep)
3431 except TypeError:
3432 # if there are any non-string values (wrong dtype or hidden behind
3433 # object dtype), np.sum will fail; catch and return with better message
3434 for column in list_of_columns:
3435 dtype = lib.infer_dtype(column, skipna=True)
3436 if dtype not in ["string", "empty"]:
3437 raise TypeError(
3438 "Concatenation requires list-likes containing only "
3439 "strings (or missing values). Offending values found in "
3440 f"column {dtype}"
3441 ) from None
3442 return result
3443
3444
def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # Empty separator: elementwise-sum the columns directly, without
        # interleaving anything between them.
        return np.sum(np.asarray(list_of_columns, dtype=object), axis=0)
    # Build [col0, sep, col1, sep, ..., colN]: every even slot holds a
    # column, every odd slot the separator, then sum elementwise.
    interleaved: list = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)
3470
3471
3472def _result_dtype(arr):
3473 # workaround #27953
3474 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
3475 # when the list of values is empty.
3476 from pandas.core.arrays.string_ import StringDtype
3477
3478 if isinstance(arr.dtype, (ArrowDtype, StringDtype)):
3479 return arr.dtype
3480 return object
3481
3482
3483def _get_single_group_name(regex: re.Pattern) -> Hashable:
3484 if regex.groupindex:
3485 return next(iter(regex.groupindex))
3486 else:
3487 return None
3488
3489
3490def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3491 """
3492 Get named groups from compiled regex.
3493
3494 Unnamed groups are numbered.
3495
3496 Parameters
3497 ----------
3498 regex : compiled regex
3499
3500 Returns
3501 -------
3502 list of column labels
3503 """
3504 names = {v: k for k, v in regex.groupindex.items()}
3505 return [names.get(1 + i, i) for i in range(regex.groups)]
3506
3507
def str_extractall(arr, pat, flags: int = 0) -> DataFrame:
    """
    Extract all occurrences of the capture groups in ``pat`` from ``arr``.

    Parameters
    ----------
    arr : Series or Index
        Strings to search; non-string entries are skipped.
    pat : str
        Regular expression; must contain at least one capture group.
    flags : int, default 0
        Flags passed to :func:`re.compile`.

    Returns
    -------
    DataFrame
        One row per match, indexed by the original index plus a "match"
        level counting matches within each subject; one column per group.

    Raises
    ------
    ValueError
        If ``pat`` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True).astype(arr.dtype)

    columns = _get_group_names(regex)
    has_multi_index = arr.index.nlevels > 1

    rows: list = []
    keys: list = []
    for key, value in arr.items():
        if not isinstance(value, str):
            # skip missing / non-string entries entirely
            continue
        key_tuple = key if has_multi_index else (key,)
        for match_number, groups in enumerate(regex.findall(value)):
            if isinstance(groups, str):
                # a single-group pattern yields bare strings from findall
                groups = (groups,)
            # empty-string captures become NaN in the result
            rows.append([np.nan if group == "" else group for group in groups])
            keys.append(key_tuple + (match_number,))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(keys, names=arr.index.names + ["match"])
    return arr._constructor_expanddim(
        rows, index=index, columns=columns, dtype=_result_dtype(arr)
    )