1from __future__ import annotations
2
3import codecs
4from functools import wraps
5import re
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Hashable,
10 Literal,
11 cast,
12)
13import warnings
14
15import numpy as np
16
17from pandas._libs import lib
18from pandas._typing import (
19 AlignJoin,
20 DtypeObj,
21 F,
22 Scalar,
23)
24from pandas.util._decorators import Appender
25from pandas.util._exceptions import find_stack_level
26
27from pandas.core.dtypes.common import (
28 ensure_object,
29 is_bool_dtype,
30 is_categorical_dtype,
31 is_integer,
32 is_list_like,
33 is_object_dtype,
34 is_re,
35)
36from pandas.core.dtypes.generic import (
37 ABCDataFrame,
38 ABCIndex,
39 ABCMultiIndex,
40 ABCSeries,
41)
42from pandas.core.dtypes.missing import isna
43
44from pandas.core.arrays.arrow.dtype import ArrowDtype
45from pandas.core.base import NoNewAttributesMixin
46from pandas.core.construction import extract_array
47
48if TYPE_CHECKING:
49 from pandas import (
50 DataFrame,
51 Index,
52 Series,
53 )
54
# Docstring templates shared between related methods (e.g. split/rsplit,
# partition/rpartition); filled per method via Appender + ``%`` formatting.
_shared_docs: dict[str, str] = {}
# Encodings for which CPython's str.encode has an optimized fast path;
# presumably used by the encode/decode methods (not in this view) to call
# str/bytes methods directly instead of going through codecs — TODO confirm.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally has fast paths for utf-16/utf-32.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
66
67
def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Create a decorator that blocks disallowed inferred dtypes for a method
    of StringMethods.

    Calling ``.str.{method}`` first constructs the :class:`StringMethods`
    accessor, whose :meth:`StringMethods._validate` only checks that the
    data's inferred dtype lies in the *union* of types that any string method
    accepts (after skipping NaNs), namely:
    ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
    Since each individual method accepts a different subset, the per-method
    restriction cannot happen in ``__init__`` and is instead applied here,
    at call time.

    The default string types ['string', 'empty'] are allowed for all methods;
    each method forbids whichever of ['bytes', 'mixed', 'mixed-integer'] it is
    not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. If None, the name of
        the wrapped method is used. An explicit name is necessary when
        composing with further wrappers (like _pat_wrapper and
        _noarg_wrapper).

    Returns
    -------
    func : wrapper
        The method to which the decorator is applied, with an added check
        that rejects data whose inferred type is in `forbidden`.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    forbidden_set = set() if forbidden is None else set(forbidden)
    allowed_types = {
        "string",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    } - forbidden_set

    def _forbid_nonstring_types(func: F) -> F:
        func_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # Fast path: type is allowed, just dispatch.
            if self._inferred_dtype in allowed_types:
                return func(self, *args, **kwargs)
            raise TypeError(
                f"Cannot use .str.{func_name} with values of "
                f"inferred dtype '{self._inferred_dtype}'."
            )

        # ``wraps`` copied func.__name__; override with the explicit name so
        # chained wrappers report the user-facing method name.
        wrapper.__name__ = func_name
        return cast(F, wrapper)

    return _forbid_nonstring_types
135
136
def _map_and_wrap(name, docstring):
    """
    Build a no-argument accessor method that dispatches to ``_str_<name>``
    on the underlying array and wraps the raw result.
    """

    def mapped(self):
        # Look up the array-level string op by name at call time and box the
        # result like every other accessor method.
        method = getattr(self._data.array, f"_str_{name}")
        return self._wrap_result(method())

    # Same guard all string methods get: reject bytes-dtype data; this also
    # sets the wrapper's __name__ to ``name``.
    mapped = forbid_nonstring_types(["bytes"], name=name)(mapped)
    mapped.__doc__ = docstring
    return mapped
145
146
class StringMethods(NoNewAttributesMixin):
    """
    Vectorized string functions for Series and Index.

    NAs stay NA unless handled otherwise by a particular method.
    Patterned after Python's string methods, with some inspiration from
    R's stringr package.

    Parameters
    ----------
    data : Series or Index
        The Series/Index whose values the string methods operate on.

    Examples
    --------
    >>> s = pd.Series(["A_Str_Series"])
    >>> s
    0    A_Str_Series
    dtype: object

    >>> s.str.split("_")
    0    [A, Str, Series]
    dtype: object

    >>> s.str.replace("_", "")
    0    AStrSeries
    dtype: object
    """

    # Note: see the docstring in pandas.core.strings.__init__
    # for an explanation of the implementation.
    # TODO: Dispatch all the methods
    # Currently the following are not dispatched to the array
    # * cat
    # * extractall
177
178 def __init__(self, data) -> None:
179 from pandas.core.arrays.string_ import StringDtype
180
181 self._inferred_dtype = self._validate(data)
182 self._is_categorical = is_categorical_dtype(data.dtype)
183 self._is_string = isinstance(data.dtype, StringDtype)
184 self._data = data
185
186 self._index = self._name = None
187 if isinstance(data, ABCSeries):
188 self._index = data.index
189 self._name = data.name
190
191 # ._values.categories works for both Series/Index
192 self._parent = data._values.categories if self._is_categorical else data
193 # save orig to blow up categoricals to the right type
194 self._orig = data
195 self._freeze()
196
197 @staticmethod
198 def _validate(data):
199 """
200 Auxiliary function for StringMethods, infers and checks dtype of data.
201
202 This is a "first line of defence" at the creation of the StringMethods-
203 object, and just checks that the dtype is in the
204 *union* of the allowed types over all string methods below; this
205 restriction is then refined on a per-method basis using the decorator
206 @forbid_nonstring_types (more info in the corresponding docstring).
207
208 This really should exclude all series/index with any non-string values,
209 but that isn't practical for performance reasons until we have a str
210 dtype (GH 9343 / 13877)
211
212 Parameters
213 ----------
214 data : The content of the Series
215
216 Returns
217 -------
218 dtype : inferred dtype of data
219 """
220 if isinstance(data, ABCMultiIndex):
221 raise AttributeError(
222 "Can only use .str accessor with Index, not MultiIndex"
223 )
224
225 # see _libs/lib.pyx for list of inferred types
226 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
227
228 data = extract_array(data)
229
230 values = getattr(data, "categories", data) # categorical / normal
231
232 inferred_dtype = lib.infer_dtype(values, skipna=True)
233
234 if inferred_dtype not in allowed_types:
235 raise AttributeError("Can only use .str accessor with string values!")
236 return inferred_dtype
237
238 def __getitem__(self, key):
239 result = self._data.array._str_getitem(key)
240 return self._wrap_result(result)
241
    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string: bool = True,
        returns_bool: bool = False,
    ):
        """
        Box the raw output of an array-level ``_str_*`` method into a
        Series/Index/DataFrame/MultiIndex matching the calling object.

        Parameters
        ----------
        result : array-like, DataFrame or scalar
            Raw output of the dispatched string method. Objects without both
            ``ndim`` and ``dtype`` are returned (almost) unchanged.
        name : Hashable or list-like, optional
            Name for the wrapped result (column labels when expanding).
        expand : bool or None
            Whether to expand list-like elements into separate columns /
            MultiIndex levels; if None, inferred from ``result.ndim``.
        fill_value : default np.nan
            NOTE(review): unused in this body — confirm callers before
            removing from the signature.
        returns_string : bool, default True
            If True and the caller is backed by a string dtype, the wrapped
            result keeps the caller's dtype; otherwise the result's own
            dtype is used.
        returns_bool : bool, default False
            NOTE(review): also unused in this body.
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.
        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1
        elif expand is True and not isinstance(self._orig, ABCIndex):
            # required when expand=True is explicitly specified
            # not needed when inferred
            if isinstance(result.dtype, ArrowDtype):
                import pyarrow as pa

                from pandas.compat import pa_version_under11p0

                from pandas.core.arrays.arrow.array import ArrowExtensionArray

                # pyarrow list result: pad every row's list out to the length
                # of the longest one so rows can be transposed into
                # equal-width columns below.
                value_lengths = result._data.combine_chunks().value_lengths()
                max_len = pa.compute.max(value_lengths).as_py()
                min_len = pa.compute.min(value_lengths).as_py()
                if result._hasna:
                    # ArrowExtensionArray.fillna doesn't work for list scalars
                    result = ArrowExtensionArray(
                        result._data.fill_null([None] * max_len)
                    )
                if min_len < max_len:
                    # append nulls to each scalar list element up to max_len
                    if not pa_version_under11p0:
                        result = ArrowExtensionArray(
                            pa.compute.list_slice(
                                result._data,
                                start=0,
                                stop=max_len,
                                return_fixed_size_list=True,
                            )
                        )
                    else:
                        # manual fallback for older pyarrow without list_slice
                        all_null = np.full(max_len, fill_value=None, dtype=object)
                        values = result.to_numpy()
                        new_values = []
                        for row in values:
                            if len(row) < max_len:
                                nulls = all_null[: max_len - len(row)]
                                row = np.append(row, nulls)
                            new_values.append(row)
                        pa_type = result._data.type
                        result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
                if name is not None:
                    labels = name
                else:
                    labels = range(max_len)
                # transpose rows -> columns; each column becomes its own
                # ArrowExtensionArray keyed by the column label
                result = {
                    label: ArrowExtensionArray(pa.array(res))
                    for label, res in zip(labels, (zip(*result.tolist())))
                }
            elif is_object_dtype(result):

                def cons_row(x):
                    # ensure every element is list-like so rows expand evenly
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                result = [cons_row(x) for x in result]
                if result and not self._is_string:
                    # propagate nan values to match longest sequence (GH 18450)
                    max_len = max(len(x) for x in result)
                    result = [
                        x * max_len if len(x) == 0 or x[0] is np.nan else x
                        for x in result
                    ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            # This is a mess.
            dtype: DtypeObj | str | None
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    dtype = result.dtype
                elif returns_string:
                    dtype = self._orig.dtype
                else:
                    dtype = vdtype
            else:
                dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result
394
395 def _get_series_list(self, others):
396 """
397 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
398 into a list of Series (elements without an index must match the length
399 of the calling Series/Index).
400
401 Parameters
402 ----------
403 others : Series, DataFrame, np.ndarray, list-like or list-like of
404 Objects that are either Series, Index or np.ndarray (1-dim).
405
406 Returns
407 -------
408 list of Series
409 Others transformed into list of Series.
410 """
411 from pandas import (
412 DataFrame,
413 Series,
414 )
415
416 # self._orig is either Series or Index
417 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
418
419 # Generally speaking, all objects without an index inherit the index
420 # `idx` of the calling Series/Index - i.e. must have matching length.
421 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
422 if isinstance(others, ABCSeries):
423 return [others]
424 elif isinstance(others, ABCIndex):
425 return [Series(others, index=idx, dtype=others.dtype)]
426 elif isinstance(others, ABCDataFrame):
427 return [others[x] for x in others]
428 elif isinstance(others, np.ndarray) and others.ndim == 2:
429 others = DataFrame(others, index=idx)
430 return [others[x] for x in others]
431 elif is_list_like(others, allow_sets=False):
432 others = list(others) # ensure iterators do not get read twice etc
433
434 # in case of list-like `others`, all elements must be
435 # either Series/Index/np.ndarray (1-dim)...
436 if all(
437 isinstance(x, (ABCSeries, ABCIndex))
438 or (isinstance(x, np.ndarray) and x.ndim == 1)
439 for x in others
440 ):
441 los: list[Series] = []
442 while others: # iterate through list and append each element
443 los = los + self._get_series_list(others.pop(0))
444 return los
445 # ... or just strings
446 elif all(not is_list_like(x) for x in others):
447 return [Series(others, index=idx)]
448 raise TypeError(
449 "others must be Series, Index, DataFrame, np.ndarray "
450 "or list-like (either containing only strings or "
451 "containing only objects of type Series/Index/"
452 "np.ndarray[1-dim])"
453 )
454
    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(
        self,
        others=None,
        sep=None,
        na_rep=None,
        join: AlignJoin = "left",
    ) -> str | Series | Index:
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        # TODO: dispatch
        from pandas import (
            Index,
            Series,
            concat,
        )

        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndex):
            # work on a Series so the align/concat logic below is uniform
            data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Series")
            data = ensure_object(data)  # type: ignore[assignment]
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # missing values are simply omitted from the joined string
                return sep.join(data[~na_mask])
            elif na_rep is not None and na_mask.any():
                return sep.join(np.where(na_mask, na_rep, data))
            else:
                return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError as err:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            ) from err

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        # rows where *any* column (including the caller) is missing
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        out: Index | Series
        if isinstance(self._orig, ABCIndex):
            # add dtype for case that result is all-NA

            out = Index(result, dtype=object, name=self._orig.name)
        else:  # Series
            if is_categorical_dtype(self._orig.dtype):
                # We need to infer the new categories.
                dtype = None
            else:
                dtype = self._orig.dtype
            res_ser = Series(
                result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
            )
            out = res_ser.__finalize__(self._orig, method="str_cat")
        return out
684
    # Template docstring shared by `split` and `rsplit`; the %(...)s fields
    # are filled in per method by the @Appender decorators on those methods.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string.

    Parameters
    ----------
    pat : str%(pat_regex)s, optional
        %(pat_description)s.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        - If ``False``, return Series/Index, containing lists of strings.
    %(regex_argument)s
    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).
    %(raises_split)s
    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`, make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.
    %(regex_pat_note)s
    Examples
    --------
    >>> s = pd.Series(
    ...     [
    ...         "this is a regular sentence",
    ...         "https://docs.python.org/3/tutorial/index.html",
    ...         np.nan
    ...     ]
    ... )
    >>> s
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat="/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                            NaN   NaN   NaN      NaN       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN
    %(regex_examples)s"""
808
809 @Appender(
810 _shared_docs["str_split"]
811 % {
812 "side": "beginning",
813 "pat_regex": " or compiled regex",
814 "pat_description": "String or regular expression to split on",
815 "regex_argument": """
816 regex : bool, default None
817 Determines if the passed-in pattern is a regular expression:
818
819 - If ``True``, assumes the passed-in pattern is a regular expression
820 - If ``False``, treats the pattern as a literal string.
821 - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
822 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
823 - Cannot be set to False if `pat` is a compiled regex
824
825 .. versionadded:: 1.4.0
826 """,
827 "raises_split": """
828 Raises
829 ------
830 ValueError
831 * if `regex` is False and `pat` is a compiled regex
832 """,
833 "regex_pat_note": """
834 Use of `regex =False` with a `pat` as a compiled regex will raise an error.
835 """,
836 "method": "split",
837 "regex_examples": r"""
838 Remember to escape special characters when explicitly using regular expressions.
839
840 >>> s = pd.Series(["foo and bar plus baz"])
841 >>> s.str.split(r"and|plus", expand=True)
842 0 1 2
843 0 foo bar baz
844
845 Regular expressions can be used to handle urls or file names.
846 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
847 as a regex only if ``len(pat) != 1``.
848
849 >>> s = pd.Series(['foojpgbar.jpg'])
850 >>> s.str.split(r".", expand=True)
851 0 1
852 0 foojpgbar jpg
853
854 >>> s.str.split(r"\.jpg", expand=True)
855 0 1
856 0 foojpgbar
857
858 When ``regex=True``, `pat` is interpreted as a regex
859
860 >>> s.str.split(r"\.jpg", regex=True, expand=True)
861 0 1
862 0 foojpgbar
863
864 A compiled regex can be passed as `pat`
865
866 >>> import re
867 >>> s.str.split(re.compile(r"\.jpg"), expand=True)
868 0 1
869 0 foojpgbar
870
871 When ``regex=False``, `pat` is interpreted as the string itself
872
873 >>> s.str.split(r"\.jpg", regex=False, expand=True)
874 0
875 0 foojpgbar.jpg
876 """,
877 }
878 )
879 @forbid_nonstring_types(["bytes"])
880 def split(
881 self,
882 pat: str | re.Pattern | None = None,
883 *,
884 n=-1,
885 expand: bool = False,
886 regex: bool | None = None,
887 ):
888 if regex is False and is_re(pat):
889 raise ValueError(
890 "Cannot use a compiled regex as replacement pattern with regex=False"
891 )
892 if is_re(pat):
893 regex = True
894 result = self._data.array._str_split(pat, n, expand, regex)
895 return self._wrap_result(result, returns_string=expand, expand=expand)
896
897 @Appender(
898 _shared_docs["str_split"]
899 % {
900 "side": "end",
901 "pat_regex": "",
902 "pat_description": "String to split on",
903 "regex_argument": "",
904 "raises_split": "",
905 "regex_pat_note": "",
906 "method": "rsplit",
907 "regex_examples": "",
908 }
909 )
910 @forbid_nonstring_types(["bytes"])
911 def rsplit(self, pat=None, *, n=-1, expand: bool = False):
912 result = self._data.array._str_rsplit(pat, n=n)
913 return self._wrap_result(result, expand=expand, returns_string=expand)
914
    # Template docstring shared by `partition` and `rpartition`; the %(...)s
    # fields are filled in per method by the @Appender decorators below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                  0  1            2
    0  Linda van der            Berg
    1         George     Pitt-Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """
996
997 @Appender(
998 _shared_docs["str_partition"]
999 % {
1000 "side": "first",
1001 "return": "3 elements containing the string itself, followed by two "
1002 "empty strings",
1003 "also": "rpartition : Split the string at the last occurrence of `sep`.",
1004 }
1005 )
1006 @forbid_nonstring_types(["bytes"])
1007 def partition(self, sep: str = " ", expand: bool = True):
1008 result = self._data.array._str_partition(sep, expand)
1009 return self._wrap_result(result, expand=expand, returns_string=expand)
1010
1011 @Appender(
1012 _shared_docs["str_partition"]
1013 % {
1014 "side": "last",
1015 "return": "3 elements containing two empty strings, followed by the "
1016 "string itself",
1017 "also": "partition : Split the string at the first occurrence of `sep`.",
1018 }
1019 )
1020 @forbid_nonstring_types(["bytes"])
1021 def rpartition(self, sep: str = " ", expand: bool = True):
1022 result = self._data.array._str_rpartition(sep, expand)
1023 return self._wrap_result(result, expand=expand, returns_string=expand)
1024
1025 def get(self, i):
1026 """
1027 Extract element from each component at specified position or with specified key.
1028
1029 Extract element from lists, tuples, dict, or strings in each element in the
1030 Series/Index.
1031
1032 Parameters
1033 ----------
1034 i : int or hashable dict label
1035 Position or key of element to extract.
1036
1037 Returns
1038 -------
1039 Series or Index
1040
1041 Examples
1042 --------
1043 >>> s = pd.Series(["String",
1044 ... (1, 2, 3),
1045 ... ["a", "b", "c"],
1046 ... 123,
1047 ... -456,
1048 ... {1: "Hello", "2": "World"}])
1049 >>> s
1050 0 String
1051 1 (1, 2, 3)
1052 2 [a, b, c]
1053 3 123
1054 4 -456
1055 5 {1: 'Hello', '2': 'World'}
1056 dtype: object
1057
1058 >>> s.str.get(1)
1059 0 t
1060 1 2
1061 2 b
1062 3 NaN
1063 4 NaN
1064 5 Hello
1065 dtype: object
1066
1067 >>> s.str.get(-1)
1068 0 g
1069 1 3
1070 2 c
1071 3 NaN
1072 4 NaN
1073 5 None
1074 dtype: object
1075
1076 Return element with given key
1077
1078 >>> s = pd.Series([{"name": "Hello", "value": "World"},
1079 ... {"name": "Goodbye", "value": "Planet"}])
1080 >>> s.str.get('name')
1081 0 Hello
1082 1 Goodbye
1083 dtype: object
1084 """
1085 result = self._data.array._str_get(i)
1086 return self._wrap_result(result)
1087
1088 @forbid_nonstring_types(["bytes"])
1089 def join(self, sep):
1090 """
1091 Join lists contained as elements in the Series/Index with passed delimiter.
1092
1093 If the elements of a Series are lists themselves, join the content of these
1094 lists using the delimiter passed to the function.
1095 This function is an equivalent to :meth:`str.join`.
1096
1097 Parameters
1098 ----------
1099 sep : str
1100 Delimiter to use between list entries.
1101
1102 Returns
1103 -------
1104 Series/Index: object
1105 The list entries concatenated by intervening occurrences of the
1106 delimiter.
1107
1108 Raises
1109 ------
1110 AttributeError
1111 If the supplied Series contains neither strings nor lists.
1112
1113 See Also
1114 --------
1115 str.join : Standard library version of this method.
1116 Series.str.split : Split strings around given separator/delimiter.
1117
1118 Notes
1119 -----
1120 If any of the list items is not a string object, the result of the join
1121 will be `NaN`.
1122
1123 Examples
1124 --------
1125 Example with a list that contains non-string elements.
1126
1127 >>> s = pd.Series([['lion', 'elephant', 'zebra'],
1128 ... [1.1, 2.2, 3.3],
1129 ... ['cat', np.nan, 'dog'],
1130 ... ['cow', 4.5, 'goat'],
1131 ... ['duck', ['swan', 'fish'], 'guppy']])
1132 >>> s
1133 0 [lion, elephant, zebra]
1134 1 [1.1, 2.2, 3.3]
1135 2 [cat, nan, dog]
1136 3 [cow, 4.5, goat]
1137 4 [duck, [swan, fish], guppy]
1138 dtype: object
1139
1140 Join all lists using a '-'. The lists containing object(s) of types other
1141 than str will produce a NaN.
1142
1143 >>> s.str.join('-')
1144 0 lion-elephant-zebra
1145 1 NaN
1146 2 NaN
1147 3 NaN
1148 4 NaN
1149 dtype: object
1150 """
1151 result = self._data.array._str_join(sep)
1152 return self._wrap_result(result)
1153
1154 @forbid_nonstring_types(["bytes"])
1155 def contains(
1156 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
1157 ):
1158 r"""
1159 Test if pattern or regex is contained within a string of a Series or Index.
1160
1161 Return boolean Series or Index based on whether a given pattern or regex is
1162 contained within a string of a Series or Index.
1163
1164 Parameters
1165 ----------
1166 pat : str
1167 Character sequence or regular expression.
1168 case : bool, default True
1169 If True, case sensitive.
1170 flags : int, default 0 (no flags)
1171 Flags to pass through to the re module, e.g. re.IGNORECASE.
1172 na : scalar, optional
1173 Fill value for missing values. The default depends on dtype of the
1174 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1175 ``pandas.NA`` is used.
1176 regex : bool, default True
1177 If True, assumes the pat is a regular expression.
1178
1179 If False, treats the pat as a literal string.
1180
1181 Returns
1182 -------
1183 Series or Index of boolean values
1184 A Series or Index of boolean values indicating whether the
1185 given pattern is contained within the string of each element
1186 of the Series or Index.
1187
1188 See Also
1189 --------
1190 match : Analogous, but stricter, relying on re.match instead of re.search.
1191 Series.str.startswith : Test if the start of each string element matches a
1192 pattern.
1193 Series.str.endswith : Same as startswith, but tests the end of string.
1194
1195 Examples
1196 --------
1197 Returning a Series of booleans using only a literal pattern.
1198
1199 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
1200 >>> s1.str.contains('og', regex=False)
1201 0 False
1202 1 True
1203 2 False
1204 3 False
1205 4 NaN
1206 dtype: object
1207
1208 Returning an Index of booleans using only a literal pattern.
1209
1210 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
1211 >>> ind.str.contains('23', regex=False)
1212 Index([False, False, False, True, nan], dtype='object')
1213
1214 Specifying case sensitivity using `case`.
1215
1216 >>> s1.str.contains('oG', case=True, regex=True)
1217 0 False
1218 1 False
1219 2 False
1220 3 False
1221 4 NaN
1222 dtype: object
1223
1224 Specifying `na` to be `False` instead of `NaN` replaces NaN values
1225 with `False`. If Series or Index does not contain NaN values
1226 the resultant dtype will be `bool`, otherwise, an `object` dtype.
1227
1228 >>> s1.str.contains('og', na=False, regex=True)
1229 0 False
1230 1 True
1231 2 False
1232 3 False
1233 4 False
1234 dtype: bool
1235
1236 Returning 'house' or 'dog' when either expression occurs in a string.
1237
1238 >>> s1.str.contains('house|dog', regex=True)
1239 0 False
1240 1 True
1241 2 True
1242 3 False
1243 4 NaN
1244 dtype: object
1245
1246 Ignoring case sensitivity using `flags` with regex.
1247
1248 >>> import re
1249 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
1250 0 False
1251 1 False
1252 2 True
1253 3 False
1254 4 NaN
1255 dtype: object
1256
1257 Returning any digit using regular expression.
1258
1259 >>> s1.str.contains('\\d', regex=True)
1260 0 False
1261 1 False
1262 2 False
1263 3 True
1264 4 NaN
1265 dtype: object
1266
1267 Ensure `pat` is a not a literal pattern when `regex` is set to True.
1268 Note in the following example one might expect only `s2[1]` and `s2[3]` to
1269 return `True`. However, '.0' as a regex matches any character
1270 followed by a 0.
1271
1272 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
1273 >>> s2.str.contains('.0', regex=True)
1274 0 True
1275 1 True
1276 2 False
1277 3 True
1278 4 False
1279 dtype: bool
1280 """
1281 if regex and re.compile(pat).groups:
1282 warnings.warn(
1283 "This pattern is interpreted as a regular expression, and has "
1284 "match groups. To actually get the groups, use str.extract.",
1285 UserWarning,
1286 stacklevel=find_stack_level(),
1287 )
1288
1289 result = self._data.array._str_contains(pat, case, flags, na, regex)
1290 return self._wrap_result(result, fill_value=na, returns_string=False)
1291
1292 @forbid_nonstring_types(["bytes"])
1293 def match(self, pat, case: bool = True, flags: int = 0, na=None):
1294 """
1295 Determine if each string starts with a match of a regular expression.
1296
1297 Parameters
1298 ----------
1299 pat : str
1300 Character sequence or regular expression.
1301 case : bool, default True
1302 If True, case sensitive.
1303 flags : int, default 0 (no flags)
1304 Regex module flags, e.g. re.IGNORECASE.
1305 na : scalar, optional
1306 Fill value for missing values. The default depends on dtype of the
1307 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1308 ``pandas.NA`` is used.
1309
1310 Returns
1311 -------
1312 Series/Index/array of boolean values
1313
1314 See Also
1315 --------
1316 fullmatch : Stricter matching that requires the entire string to match.
1317 contains : Analogous, but less strict, relying on re.search instead of
1318 re.match.
1319 extract : Extract matched groups.
1320 """
1321 result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
1322 return self._wrap_result(result, fill_value=na, returns_string=False)
1323
1324 @forbid_nonstring_types(["bytes"])
1325 def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
1326 """
1327 Determine if each string entirely matches a regular expression.
1328
1329 .. versionadded:: 1.1.0
1330
1331 Parameters
1332 ----------
1333 pat : str
1334 Character sequence or regular expression.
1335 case : bool, default True
1336 If True, case sensitive.
1337 flags : int, default 0 (no flags)
1338 Regex module flags, e.g. re.IGNORECASE.
1339 na : scalar, optional
1340 Fill value for missing values. The default depends on dtype of the
1341 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1342 ``pandas.NA`` is used.
1343
1344 Returns
1345 -------
1346 Series/Index/array of boolean values
1347
1348 See Also
1349 --------
1350 match : Similar, but also returns `True` when only a *prefix* of the string
1351 matches the regular expression.
1352 extract : Extract matched groups.
1353 """
1354 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
1355 return self._wrap_result(result, fill_value=na, returns_string=False)
1356
    @forbid_nonstring_types(["bytes"])
    def replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool | None = None,
        flags: int = 0,
        regex: bool = False,
    ):
        r"""
        Replace each occurrence of pattern/regex in the Series/Index.

        Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
        the regex value.

        Parameters
        ----------
        pat : str or compiled regex
            String can be a character sequence or regular expression.
        repl : str or callable
            Replacement string or a callable. The callable is passed the regex
            match object and must return a replacement string to be used.
            See :func:`re.sub`.
        n : int, default -1 (all)
            Number of replacements to make from start.
        case : bool, default None
            Determines if replace is case sensitive:

            - If True, case sensitive (the default if `pat` is a string)
            - Set to False for case insensitive
            - Cannot be set if `pat` is a compiled regex.

        flags : int, default 0 (no flags)
            Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
            regex.
        regex : bool, default False
            Determines if the passed-in pattern is a regular expression:

            - If True, assumes the passed-in pattern is a regular expression.
            - If False, treats the pattern as a literal string
            - Cannot be set to False if `pat` is a compiled regex or `repl` is
              a callable.

        Returns
        -------
        Series or Index of object
            A copy of the object with all matching occurrences of `pat` replaced by
            `repl`.

        Raises
        ------
        ValueError
            * if `regex` is False and `repl` is a callable or `pat` is a compiled
              regex
            * if `pat` is a compiled regex and `case` or `flags` is set

        Notes
        -----
        When `pat` is a compiled regex, all flags should be included in the
        compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
        regex will raise an error.

        Examples
        --------
        When `pat` is a string and `regex` is True (the default), the given `pat`
        is compiled as a regex. When `repl` is a string, it replaces matching
        regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
        left as is:

        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
        0    bao
        1    baz
        2    NaN
        dtype: object

        When `pat` is a string and `regex` is False, every `pat` is replaced with
        `repl` as with :meth:`str.replace`:

        >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
        0    bao
        1    fuz
        2    NaN
        dtype: object

        When `repl` is a callable, it is called on every `pat` using
        :func:`re.sub`. The callable should expect one positional argument
        (a regex object) and return a string.

        To get the idea:

        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
        0    <re.Match object; span=(0, 1), match='f'>oo
        1    <re.Match object; span=(0, 1), match='f'>uz
        2                                            NaN
        dtype: object

        Reverse every lowercase alphabetic word:

        >>> repl = lambda m: m.group(0)[::-1]
        >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
        >>> ser.str.replace(r'[a-z]+', repl, regex=True)
        0    oof 123
        1    rab zab
        2        NaN
        dtype: object

        Using regex groups (extract second group and swap case):

        >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
        >>> repl = lambda m: m.group('two').swapcase()
        >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
        >>> ser.str.replace(pat, repl, regex=True)
        0    tWO
        1    bAR
        dtype: object

        Using a compiled regex with flags

        >>> import re
        >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
        0    foo
        1    bar
        2    NaN
        dtype: object
        """
        # Check whether repl is valid (GH 13438, GH 15055)
        if not (isinstance(repl, str) or callable(repl)):
            raise TypeError("repl must be a string or callable")

        is_compiled_re = is_re(pat)
        # NOTE(review): `regex` is annotated as bool, so `regex is None` looks
        # like a leftover from when None was an accepted sentinel; it still
        # routes out-of-contract callers passing None onto the regex path.
        if regex or regex is None:
            # A pre-compiled pattern already encodes its flags and case
            # sensitivity, so the separate arguments must not also be set.
            if is_compiled_re and (case is not None or flags != 0):
                raise ValueError(
                    "case and flags cannot be set when pat is a compiled regex"
                )

        elif is_compiled_re:
            # regex=False contradicts a compiled pattern argument.
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with regex=False"
            )
        elif callable(repl):
            # Callable replacements are only invoked through re.sub, which
            # requires the regex path.
            raise ValueError("Cannot use a callable replacement when regex=False")

        # Default to case-sensitive matching when the caller did not specify.
        if case is None:
            case = True

        result = self._data.array._str_replace(
            pat, repl, n=n, case=case, flags=flags, regex=regex
        )
        return self._wrap_result(result)
1509
1510 @forbid_nonstring_types(["bytes"])
1511 def repeat(self, repeats):
1512 """
1513 Duplicate each string in the Series or Index.
1514
1515 Parameters
1516 ----------
1517 repeats : int or sequence of int
1518 Same value for all (int) or different value per (sequence).
1519
1520 Returns
1521 -------
1522 Series or pandas.Index
1523 Series or Index of repeated string objects specified by
1524 input parameter repeats.
1525
1526 Examples
1527 --------
1528 >>> s = pd.Series(['a', 'b', 'c'])
1529 >>> s
1530 0 a
1531 1 b
1532 2 c
1533 dtype: object
1534
1535 Single int repeats string in Series
1536
1537 >>> s.str.repeat(repeats=2)
1538 0 aa
1539 1 bb
1540 2 cc
1541 dtype: object
1542
1543 Sequence of int repeats corresponding string in Series
1544
1545 >>> s.str.repeat(repeats=[1, 2, 3])
1546 0 a
1547 1 bb
1548 2 ccc
1549 dtype: object
1550 """
1551 result = self._data.array._str_repeat(repeats)
1552 return self._wrap_result(result)
1553
1554 @forbid_nonstring_types(["bytes"])
1555 def pad(
1556 self,
1557 width,
1558 side: Literal["left", "right", "both"] = "left",
1559 fillchar: str = " ",
1560 ):
1561 """
1562 Pad strings in the Series/Index up to width.
1563
1564 Parameters
1565 ----------
1566 width : int
1567 Minimum width of resulting string; additional characters will be filled
1568 with character defined in `fillchar`.
1569 side : {'left', 'right', 'both'}, default 'left'
1570 Side from which to fill resulting string.
1571 fillchar : str, default ' '
1572 Additional character for filling, default is whitespace.
1573
1574 Returns
1575 -------
1576 Series or Index of object
1577 Returns Series or Index with minimum number of char in object.
1578
1579 See Also
1580 --------
1581 Series.str.rjust : Fills the left side of strings with an arbitrary
1582 character. Equivalent to ``Series.str.pad(side='left')``.
1583 Series.str.ljust : Fills the right side of strings with an arbitrary
1584 character. Equivalent to ``Series.str.pad(side='right')``.
1585 Series.str.center : Fills both sides of strings with an arbitrary
1586 character. Equivalent to ``Series.str.pad(side='both')``.
1587 Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1588 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
1589
1590 Examples
1591 --------
1592 >>> s = pd.Series(["caribou", "tiger"])
1593 >>> s
1594 0 caribou
1595 1 tiger
1596 dtype: object
1597
1598 >>> s.str.pad(width=10)
1599 0 caribou
1600 1 tiger
1601 dtype: object
1602
1603 >>> s.str.pad(width=10, side='right', fillchar='-')
1604 0 caribou---
1605 1 tiger-----
1606 dtype: object
1607
1608 >>> s.str.pad(width=10, side='both', fillchar='-')
1609 0 -caribou--
1610 1 --tiger---
1611 dtype: object
1612 """
1613 if not isinstance(fillchar, str):
1614 msg = f"fillchar must be a character, not {type(fillchar).__name__}"
1615 raise TypeError(msg)
1616
1617 if len(fillchar) != 1:
1618 raise TypeError("fillchar must be a character, not str")
1619
1620 if not is_integer(width):
1621 msg = f"width must be of integer type, not {type(width).__name__}"
1622 raise TypeError(msg)
1623
1624 result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
1625 return self._wrap_result(result)
1626
    # Docstring template shared by center/ljust/rjust; the %(side)s and
    # %(method)s placeholders are filled in by each method's @Appender.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series/Index of objects.
    """
1646
1647 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
1648 @forbid_nonstring_types(["bytes"])
1649 def center(self, width, fillchar: str = " "):
1650 return self.pad(width, side="both", fillchar=fillchar)
1651
1652 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
1653 @forbid_nonstring_types(["bytes"])
1654 def ljust(self, width, fillchar: str = " "):
1655 return self.pad(width, side="right", fillchar=fillchar)
1656
1657 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
1658 @forbid_nonstring_types(["bytes"])
1659 def rjust(self, width, fillchar: str = " "):
1660 return self.pad(width, side="left", fillchar=fillchar)
1661
1662 @forbid_nonstring_types(["bytes"])
1663 def zfill(self, width):
1664 """
1665 Pad strings in the Series/Index by prepending '0' characters.
1666
1667 Strings in the Series/Index are padded with '0' characters on the
1668 left of the string to reach a total string length `width`. Strings
1669 in the Series/Index with length greater or equal to `width` are
1670 unchanged.
1671
1672 Parameters
1673 ----------
1674 width : int
1675 Minimum length of resulting string; strings with length less
1676 than `width` be prepended with '0' characters.
1677
1678 Returns
1679 -------
1680 Series/Index of objects.
1681
1682 See Also
1683 --------
1684 Series.str.rjust : Fills the left side of strings with an arbitrary
1685 character.
1686 Series.str.ljust : Fills the right side of strings with an arbitrary
1687 character.
1688 Series.str.pad : Fills the specified sides of strings with an arbitrary
1689 character.
1690 Series.str.center : Fills both sides of strings with an arbitrary
1691 character.
1692
1693 Notes
1694 -----
1695 Differs from :meth:`str.zfill` which has special handling
1696 for '+'/'-' in the string.
1697
1698 Examples
1699 --------
1700 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
1701 >>> s
1702 0 -1
1703 1 1
1704 2 1000
1705 3 10
1706 4 NaN
1707 dtype: object
1708
1709 Note that ``10`` and ``NaN`` are not strings, therefore they are
1710 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
1711 special character and the zero is added to the right of it
1712 (:meth:`str.zfill` would have moved it to the left). ``1000``
1713 remains unchanged as it is longer than `width`.
1714
1715 >>> s.str.zfill(3)
1716 0 -01
1717 1 001
1718 2 1000
1719 3 NaN
1720 4 NaN
1721 dtype: object
1722 """
1723 if not is_integer(width):
1724 msg = f"width must be of integer type, not {type(width).__name__}"
1725 raise TypeError(msg)
1726 f = lambda x: x.zfill(width)
1727 result = self._data.array._str_map(f)
1728 return self._wrap_result(result)
1729
1730 def slice(self, start=None, stop=None, step=None):
1731 """
1732 Slice substrings from each element in the Series or Index.
1733
1734 Parameters
1735 ----------
1736 start : int, optional
1737 Start position for slice operation.
1738 stop : int, optional
1739 Stop position for slice operation.
1740 step : int, optional
1741 Step size for slice operation.
1742
1743 Returns
1744 -------
1745 Series or Index of object
1746 Series or Index from sliced substring from original string object.
1747
1748 See Also
1749 --------
1750 Series.str.slice_replace : Replace a slice with a string.
1751 Series.str.get : Return element at position.
1752 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
1753 being the position.
1754
1755 Examples
1756 --------
1757 >>> s = pd.Series(["koala", "dog", "chameleon"])
1758 >>> s
1759 0 koala
1760 1 dog
1761 2 chameleon
1762 dtype: object
1763
1764 >>> s.str.slice(start=1)
1765 0 oala
1766 1 og
1767 2 hameleon
1768 dtype: object
1769
1770 >>> s.str.slice(start=-1)
1771 0 a
1772 1 g
1773 2 n
1774 dtype: object
1775
1776 >>> s.str.slice(stop=2)
1777 0 ko
1778 1 do
1779 2 ch
1780 dtype: object
1781
1782 >>> s.str.slice(step=2)
1783 0 kaa
1784 1 dg
1785 2 caeen
1786 dtype: object
1787
1788 >>> s.str.slice(start=0, stop=5, step=3)
1789 0 kl
1790 1 d
1791 2 cm
1792 dtype: object
1793
1794 Equivalent behaviour to:
1795
1796 >>> s.str[0:5:3]
1797 0 kl
1798 1 d
1799 2 cm
1800 dtype: object
1801 """
1802 result = self._data.array._str_slice(start, stop, step)
1803 return self._wrap_result(result)
1804
1805 @forbid_nonstring_types(["bytes"])
1806 def slice_replace(self, start=None, stop=None, repl=None):
1807 """
1808 Replace a positional slice of a string with another value.
1809
1810 Parameters
1811 ----------
1812 start : int, optional
1813 Left index position to use for the slice. If not specified (None),
1814 the slice is unbounded on the left, i.e. slice from the start
1815 of the string.
1816 stop : int, optional
1817 Right index position to use for the slice. If not specified (None),
1818 the slice is unbounded on the right, i.e. slice until the
1819 end of the string.
1820 repl : str, optional
1821 String for replacement. If not specified (None), the sliced region
1822 is replaced with an empty string.
1823
1824 Returns
1825 -------
1826 Series or Index
1827 Same type as the original object.
1828
1829 See Also
1830 --------
1831 Series.str.slice : Just slicing without replacement.
1832
1833 Examples
1834 --------
1835 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
1836 >>> s
1837 0 a
1838 1 ab
1839 2 abc
1840 3 abdc
1841 4 abcde
1842 dtype: object
1843
1844 Specify just `start`, meaning replace `start` until the end of the
1845 string with `repl`.
1846
1847 >>> s.str.slice_replace(1, repl='X')
1848 0 aX
1849 1 aX
1850 2 aX
1851 3 aX
1852 4 aX
1853 dtype: object
1854
1855 Specify just `stop`, meaning the start of the string to `stop` is replaced
1856 with `repl`, and the rest of the string is included.
1857
1858 >>> s.str.slice_replace(stop=2, repl='X')
1859 0 X
1860 1 X
1861 2 Xc
1862 3 Xdc
1863 4 Xcde
1864 dtype: object
1865
1866 Specify `start` and `stop`, meaning the slice from `start` to `stop` is
1867 replaced with `repl`. Everything before or after `start` and `stop` is
1868 included as is.
1869
1870 >>> s.str.slice_replace(start=1, stop=3, repl='X')
1871 0 aX
1872 1 aX
1873 2 aX
1874 3 aXc
1875 4 aXde
1876 dtype: object
1877 """
1878 result = self._data.array._str_slice_replace(start, stop, repl)
1879 return self._wrap_result(result)
1880
1881 def decode(self, encoding, errors: str = "strict"):
1882 """
1883 Decode character string in the Series/Index using indicated encoding.
1884
1885 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
1886 python3.
1887
1888 Parameters
1889 ----------
1890 encoding : str
1891 errors : str, optional
1892
1893 Returns
1894 -------
1895 Series or Index
1896 """
1897 # TODO: Add a similar _bytes interface.
1898 if encoding in _cpython_optimized_decoders:
1899 # CPython optimized implementation
1900 f = lambda x: x.decode(encoding, errors)
1901 else:
1902 decoder = codecs.getdecoder(encoding)
1903 f = lambda x: decoder(x, errors)[0]
1904 arr = self._data.array
1905 # assert isinstance(arr, (StringArray,))
1906 result = arr._str_map(f)
1907 return self._wrap_result(result)
1908
1909 @forbid_nonstring_types(["bytes"])
1910 def encode(self, encoding, errors: str = "strict"):
1911 """
1912 Encode character string in the Series/Index using indicated encoding.
1913
1914 Equivalent to :meth:`str.encode`.
1915
1916 Parameters
1917 ----------
1918 encoding : str
1919 errors : str, optional
1920
1921 Returns
1922 -------
1923 Series/Index of objects
1924 """
1925 result = self._data.array._str_encode(encoding, errors)
1926 return self._wrap_result(result, returns_string=False)
1927
    # Docstring template shared by strip/lstrip/rstrip; %(position)s, %(side)s
    # and %(method)s are filled in by each method's @Appender.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """
2003
2004 @Appender(
2005 _shared_docs["str_strip"]
2006 % {
2007 "side": "left and right sides",
2008 "method": "strip",
2009 "position": "leading and trailing",
2010 }
2011 )
2012 @forbid_nonstring_types(["bytes"])
2013 def strip(self, to_strip=None):
2014 result = self._data.array._str_strip(to_strip)
2015 return self._wrap_result(result)
2016
2017 @Appender(
2018 _shared_docs["str_strip"]
2019 % {"side": "left side", "method": "lstrip", "position": "leading"}
2020 )
2021 @forbid_nonstring_types(["bytes"])
2022 def lstrip(self, to_strip=None):
2023 result = self._data.array._str_lstrip(to_strip)
2024 return self._wrap_result(result)
2025
2026 @Appender(
2027 _shared_docs["str_strip"]
2028 % {"side": "right side", "method": "rstrip", "position": "trailing"}
2029 )
2030 @forbid_nonstring_types(["bytes"])
2031 def rstrip(self, to_strip=None):
2032 result = self._data.array._str_rstrip(to_strip)
2033 return self._wrap_result(result)
2034
    # Docstring template shared by removeprefix/removesuffix; %(side)s and
    # %(other_side)s are filled in by each method's @Appender.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """
2082
2083 @Appender(
2084 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
2085 )
2086 @forbid_nonstring_types(["bytes"])
2087 def removeprefix(self, prefix):
2088 result = self._data.array._str_removeprefix(prefix)
2089 return self._wrap_result(result)
2090
2091 @Appender(
2092 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
2093 )
2094 @forbid_nonstring_types(["bytes"])
2095 def removesuffix(self, suffix):
2096 result = self._data.array._str_removesuffix(suffix)
2097 return self._wrap_result(result)
2098
2099 @forbid_nonstring_types(["bytes"])
2100 def wrap(self, width, **kwargs):
2101 r"""
2102 Wrap strings in Series/Index at specified line width.
2103
2104 This method has the same keyword parameters and defaults as
2105 :class:`textwrap.TextWrapper`.
2106
2107 Parameters
2108 ----------
2109 width : int
2110 Maximum line width.
2111 expand_tabs : bool, optional
2112 If True, tab characters will be expanded to spaces (default: True).
2113 replace_whitespace : bool, optional
2114 If True, each whitespace character (as defined by string.whitespace)
2115 remaining after tab expansion will be replaced by a single space
2116 (default: True).
2117 drop_whitespace : bool, optional
2118 If True, whitespace that, after wrapping, happens to end up at the
2119 beginning or end of a line is dropped (default: True).
2120 break_long_words : bool, optional
2121 If True, then words longer than width will be broken in order to ensure
2122 that no lines are longer than width. If it is false, long words will
2123 not be broken, and some lines may be longer than width (default: True).
2124 break_on_hyphens : bool, optional
2125 If True, wrapping will occur preferably on whitespace and right after
2126 hyphens in compound words, as it is customary in English. If false,
2127 only whitespaces will be considered as potentially good places for line
2128 breaks, but you need to set break_long_words to false if you want truly
2129 insecable words (default: True).
2130
2131 Returns
2132 -------
2133 Series or Index
2134
2135 Notes
2136 -----
2137 Internally, this method uses a :class:`textwrap.TextWrapper` instance with
2138 default settings. To achieve behavior matching R's stringr library str_wrap
2139 function, use the arguments:
2140
2141 - expand_tabs = False
2142 - replace_whitespace = True
2143 - drop_whitespace = True
2144 - break_long_words = False
2145 - break_on_hyphens = False
2146
2147 Examples
2148 --------
2149 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
2150 >>> s.str.wrap(12)
2151 0 line to be\nwrapped
2152 1 another line\nto be\nwrapped
2153 dtype: object
2154 """
2155 result = self._data.array._str_wrap(width, **kwargs)
2156 return self._wrap_result(result)
2157
2158 @forbid_nonstring_types(["bytes"])
2159 def get_dummies(self, sep: str = "|"):
2160 """
2161 Return DataFrame of dummy/indicator variables for Series.
2162
2163 Each string in Series is split by sep and returned as a DataFrame
2164 of dummy/indicator variables.
2165
2166 Parameters
2167 ----------
2168 sep : str, default "|"
2169 String to split on.
2170
2171 Returns
2172 -------
2173 DataFrame
2174 Dummy variables corresponding to values of the Series.
2175
2176 See Also
2177 --------
2178 get_dummies : Convert categorical variable into dummy/indicator
2179 variables.
2180
2181 Examples
2182 --------
2183 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
2184 a b c
2185 0 1 1 0
2186 1 1 0 0
2187 2 1 0 1
2188
2189 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
2190 a b c
2191 0 1 1 0
2192 1 0 0 0
2193 2 1 0 1
2194 """
2195 # we need to cast to Series of strings as only that has all
2196 # methods available for making the dummies...
2197 result, name = self._data.array._str_get_dummies(sep)
2198 return self._wrap_result(
2199 result,
2200 name=name,
2201 expand=True,
2202 returns_string=False,
2203 )
2204
2205 @forbid_nonstring_types(["bytes"])
2206 def translate(self, table):
2207 """
2208 Map all characters in the string through the given mapping table.
2209
2210 Equivalent to standard :meth:`str.translate`.
2211
2212 Parameters
2213 ----------
2214 table : dict
2215 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
2216 None. Unmapped characters are left untouched.
2217 Characters mapped to None are deleted. :meth:`str.maketrans` is a
2218 helper function for making translation tables.
2219
2220 Returns
2221 -------
2222 Series or Index
2223 """
2224 result = self._data.array._str_translate(table)
2225 return self._wrap_result(result)
2226
2227 @forbid_nonstring_types(["bytes"])
2228 def count(self, pat, flags: int = 0):
2229 r"""
2230 Count occurrences of pattern in each string of the Series/Index.
2231
2232 This function is used to count the number of times a particular regex
2233 pattern is repeated in each of the string elements of the
2234 :class:`~pandas.Series`.
2235
2236 Parameters
2237 ----------
2238 pat : str
2239 Valid regular expression.
2240 flags : int, default 0, meaning no flags
2241 Flags for the `re` module. For a complete list, `see here
2242 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
2243 **kwargs
2244 For compatibility with other string methods. Not used.
2245
2246 Returns
2247 -------
2248 Series or Index
2249 Same type as the calling object containing the integer counts.
2250
2251 See Also
2252 --------
2253 re : Standard library module for regular expressions.
2254 str.count : Standard library version, without regular expression support.
2255
2256 Notes
2257 -----
2258 Some characters need to be escaped when passing in `pat`.
2259 eg. ``'$'`` has a special meaning in regex and must be escaped when
2260 finding this literal character.
2261
2262 Examples
2263 --------
2264 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
2265 >>> s.str.count('a')
2266 0 0.0
2267 1 0.0
2268 2 2.0
2269 3 2.0
2270 4 NaN
2271 5 0.0
2272 6 1.0
2273 dtype: float64
2274
2275 Escape ``'$'`` to find the literal dollar sign.
2276
2277 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
2278 >>> s.str.count('\\$')
2279 0 1
2280 1 0
2281 2 1
2282 3 2
2283 4 2
2284 5 0
2285 dtype: int64
2286
2287 This is also available on Index
2288
2289 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
2290 Index([0, 0, 2, 1], dtype='int64')
2291 """
2292 result = self._data.array._str_count(pat, flags)
2293 return self._wrap_result(result, returns_string=False)
2294
2295 @forbid_nonstring_types(["bytes"])
2296 def startswith(
2297 self, pat: str | tuple[str, ...], na: Scalar | None = None
2298 ) -> Series | Index:
2299 """
2300 Test if the start of each string element matches a pattern.
2301
2302 Equivalent to :meth:`str.startswith`.
2303
2304 Parameters
2305 ----------
2306 pat : str or tuple[str, ...]
2307 Character sequence or tuple of strings. Regular expressions are not
2308 accepted.
2309 na : object, default NaN
2310 Object shown if element tested is not a string. The default depends
2311 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2312 For ``StringDtype``, ``pandas.NA`` is used.
2313
2314 Returns
2315 -------
2316 Series or Index of bool
2317 A Series of booleans indicating whether the given pattern matches
2318 the start of each string element.
2319
2320 See Also
2321 --------
2322 str.startswith : Python standard library string method.
2323 Series.str.endswith : Same as startswith, but tests the end of string.
2324 Series.str.contains : Tests if string element contains a pattern.
2325
2326 Examples
2327 --------
2328 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
2329 >>> s
2330 0 bat
2331 1 Bear
2332 2 cat
2333 3 NaN
2334 dtype: object
2335
2336 >>> s.str.startswith('b')
2337 0 True
2338 1 False
2339 2 False
2340 3 NaN
2341 dtype: object
2342
2343 >>> s.str.startswith(('b', 'B'))
2344 0 True
2345 1 True
2346 2 False
2347 3 NaN
2348 dtype: object
2349
2350 Specifying `na` to be `False` instead of `NaN`.
2351
2352 >>> s.str.startswith('b', na=False)
2353 0 True
2354 1 False
2355 2 False
2356 3 False
2357 dtype: bool
2358 """
2359 if not isinstance(pat, (str, tuple)):
2360 msg = f"expected a string or tuple, not {type(pat).__name__}"
2361 raise TypeError(msg)
2362 result = self._data.array._str_startswith(pat, na=na)
2363 return self._wrap_result(result, returns_string=False)
2364
2365 @forbid_nonstring_types(["bytes"])
2366 def endswith(
2367 self, pat: str | tuple[str, ...], na: Scalar | None = None
2368 ) -> Series | Index:
2369 """
2370 Test if the end of each string element matches a pattern.
2371
2372 Equivalent to :meth:`str.endswith`.
2373
2374 Parameters
2375 ----------
2376 pat : str or tuple[str, ...]
2377 Character sequence or tuple of strings. Regular expressions are not
2378 accepted.
2379 na : object, default NaN
2380 Object shown if element tested is not a string. The default depends
2381 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2382 For ``StringDtype``, ``pandas.NA`` is used.
2383
2384 Returns
2385 -------
2386 Series or Index of bool
2387 A Series of booleans indicating whether the given pattern matches
2388 the end of each string element.
2389
2390 See Also
2391 --------
2392 str.endswith : Python standard library string method.
2393 Series.str.startswith : Same as endswith, but tests the start of string.
2394 Series.str.contains : Tests if string element contains a pattern.
2395
2396 Examples
2397 --------
2398 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
2399 >>> s
2400 0 bat
2401 1 bear
2402 2 caT
2403 3 NaN
2404 dtype: object
2405
2406 >>> s.str.endswith('t')
2407 0 True
2408 1 False
2409 2 False
2410 3 NaN
2411 dtype: object
2412
2413 >>> s.str.endswith(('t', 'T'))
2414 0 True
2415 1 False
2416 2 True
2417 3 NaN
2418 dtype: object
2419
2420 Specifying `na` to be `False` instead of `NaN`.
2421
2422 >>> s.str.endswith('t', na=False)
2423 0 True
2424 1 False
2425 2 False
2426 3 False
2427 dtype: bool
2428 """
2429 if not isinstance(pat, (str, tuple)):
2430 msg = f"expected a string or tuple, not {type(pat).__name__}"
2431 raise TypeError(msg)
2432 result = self._data.array._str_endswith(pat, na=na)
2433 return self._wrap_result(result, returns_string=False)
2434
2435 @forbid_nonstring_types(["bytes"])
2436 def findall(self, pat, flags: int = 0):
2437 """
2438 Find all occurrences of pattern or regular expression in the Series/Index.
2439
2440 Equivalent to applying :func:`re.findall` to all the elements in the
2441 Series/Index.
2442
2443 Parameters
2444 ----------
2445 pat : str
2446 Pattern or regular expression.
2447 flags : int, default 0
2448 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
2449 means no flags).
2450
2451 Returns
2452 -------
2453 Series/Index of lists of strings
2454 All non-overlapping matches of pattern or regular expression in each
2455 string of this Series/Index.
2456
2457 See Also
2458 --------
2459 count : Count occurrences of pattern or regular expression in each string
2460 of the Series/Index.
2461 extractall : For each string in the Series, extract groups from all matches
2462 of regular expression and return a DataFrame with one row for each
2463 match and one column for each group.
2464 re.findall : The equivalent ``re`` function to all non-overlapping matches
2465 of pattern or regular expression in string, as a list of strings.
2466
2467 Examples
2468 --------
2469 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
2470
2471 The search for the pattern 'Monkey' returns one match:
2472
2473 >>> s.str.findall('Monkey')
2474 0 []
2475 1 [Monkey]
2476 2 []
2477 dtype: object
2478
2479 On the other hand, the search for the pattern 'MONKEY' doesn't return any
2480 match:
2481
2482 >>> s.str.findall('MONKEY')
2483 0 []
2484 1 []
2485 2 []
2486 dtype: object
2487
2488 Flags can be added to the pattern or regular expression. For instance,
2489 to find the pattern 'MONKEY' ignoring the case:
2490
2491 >>> import re
2492 >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
2493 0 []
2494 1 [Monkey]
2495 2 []
2496 dtype: object
2497
2498 When the pattern matches more than one string in the Series, all matches
2499 are returned:
2500
2501 >>> s.str.findall('on')
2502 0 [on]
2503 1 [on]
2504 2 []
2505 dtype: object
2506
2507 Regular expressions are supported too. For instance, the search for all the
2508 strings ending with the word 'on' is shown next:
2509
2510 >>> s.str.findall('on$')
2511 0 [on]
2512 1 []
2513 2 []
2514 dtype: object
2515
2516 If the pattern is found more than once in the same string, then a list of
2517 multiple strings is returned:
2518
2519 >>> s.str.findall('b')
2520 0 []
2521 1 []
2522 2 [b, b]
2523 dtype: object
2524 """
2525 result = self._data.array._str_findall(pat, flags)
2526 return self._wrap_result(result, returns_string=False)
2527
2528 @forbid_nonstring_types(["bytes"])
2529 def extract(
2530 self, pat: str, flags: int = 0, expand: bool = True
2531 ) -> DataFrame | Series | Index:
2532 r"""
2533 Extract capture groups in the regex `pat` as columns in a DataFrame.
2534
2535 For each subject string in the Series, extract groups from the
2536 first match of regular expression `pat`.
2537
2538 Parameters
2539 ----------
2540 pat : str
2541 Regular expression pattern with capturing groups.
2542 flags : int, default 0 (no flags)
2543 Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
2544 modify regular expression matching for things like case,
2545 spaces, etc. For more details, see :mod:`re`.
2546 expand : bool, default True
2547 If True, return DataFrame with one column per capture group.
2548 If False, return a Series/Index if there is one capture group
2549 or DataFrame if there are multiple capture groups.
2550
2551 Returns
2552 -------
2553 DataFrame or Series or Index
2554 A DataFrame with one row for each subject string, and one
2555 column for each group. Any capture group names in regular
2556 expression pat will be used for column names; otherwise
2557 capture group numbers will be used. The dtype of each result
2558 column is always object, even when no match is found. If
2559 ``expand=False`` and pat has only one capture group, then
2560 return a Series (if subject is a Series) or Index (if subject
2561 is an Index).
2562
2563 See Also
2564 --------
2565 extractall : Returns all matches (not just the first match).
2566
2567 Examples
2568 --------
2569 A pattern with two groups will return a DataFrame with two columns.
2570 Non-matches will be NaN.
2571
2572 >>> s = pd.Series(['a1', 'b2', 'c3'])
2573 >>> s.str.extract(r'([ab])(\d)')
2574 0 1
2575 0 a 1
2576 1 b 2
2577 2 NaN NaN
2578
2579 A pattern may contain optional groups.
2580
2581 >>> s.str.extract(r'([ab])?(\d)')
2582 0 1
2583 0 a 1
2584 1 b 2
2585 2 NaN 3
2586
2587 Named groups will become column names in the result.
2588
2589 >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
2590 letter digit
2591 0 a 1
2592 1 b 2
2593 2 NaN NaN
2594
2595 A pattern with one group will return a DataFrame with one column
2596 if expand=True.
2597
2598 >>> s.str.extract(r'[ab](\d)', expand=True)
2599 0
2600 0 1
2601 1 2
2602 2 NaN
2603
2604 A pattern with one group will return a Series if expand=False.
2605
2606 >>> s.str.extract(r'[ab](\d)', expand=False)
2607 0 1
2608 1 2
2609 2 NaN
2610 dtype: object
2611 """
2612 from pandas import DataFrame
2613
2614 if not isinstance(expand, bool):
2615 raise ValueError("expand must be True or False")
2616
2617 regex = re.compile(pat, flags=flags)
2618 if regex.groups == 0:
2619 raise ValueError("pattern contains no capture groups")
2620
2621 if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
2622 raise ValueError("only one regex group is supported with Index")
2623
2624 obj = self._data
2625 result_dtype = _result_dtype(obj)
2626
2627 returns_df = regex.groups > 1 or expand
2628
2629 if returns_df:
2630 name = None
2631 columns = _get_group_names(regex)
2632
2633 if obj.array.size == 0:
2634 result = DataFrame(columns=columns, dtype=result_dtype)
2635
2636 else:
2637 result_list = self._data.array._str_extract(
2638 pat, flags=flags, expand=returns_df
2639 )
2640
2641 result_index: Index | None
2642 if isinstance(obj, ABCSeries):
2643 result_index = obj.index
2644 else:
2645 result_index = None
2646
2647 result = DataFrame(
2648 result_list, columns=columns, index=result_index, dtype=result_dtype
2649 )
2650
2651 else:
2652 name = _get_single_group_name(regex)
2653 result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
2654 return self._wrap_result(result, name=name)
2655
2656 @forbid_nonstring_types(["bytes"])
2657 def extractall(self, pat, flags: int = 0):
2658 r"""
2659 Extract capture groups in the regex `pat` as columns in DataFrame.
2660
2661 For each subject string in the Series, extract groups from all
2662 matches of regular expression pat. When each subject string in the
2663 Series has exactly one match, extractall(pat).xs(0, level='match')
2664 is the same as extract(pat).
2665
2666 Parameters
2667 ----------
2668 pat : str
2669 Regular expression pattern with capturing groups.
2670 flags : int, default 0 (no flags)
2671 A ``re`` module flag, for example ``re.IGNORECASE``. These allow
2672 to modify regular expression matching for things like case, spaces,
2673 etc. Multiple flags can be combined with the bitwise OR operator,
2674 for example ``re.IGNORECASE | re.MULTILINE``.
2675
2676 Returns
2677 -------
2678 DataFrame
2679 A ``DataFrame`` with one row for each match, and one column for each
2680 group. Its rows have a ``MultiIndex`` with first levels that come from
2681 the subject ``Series``. The last level is named 'match' and indexes the
2682 matches in each item of the ``Series``. Any capture group names in
2683 regular expression pat will be used for column names; otherwise capture
2684 group numbers will be used.
2685
2686 See Also
2687 --------
2688 extract : Returns first match only (not all matches).
2689
2690 Examples
2691 --------
2692 A pattern with one group will return a DataFrame with one column.
2693 Indices with no matches will not appear in the result.
2694
2695 >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
2696 >>> s.str.extractall(r"[ab](\d)")
2697 0
2698 match
2699 A 0 1
2700 1 2
2701 B 0 1
2702
2703 Capture group names are used for column names of the result.
2704
2705 >>> s.str.extractall(r"[ab](?P<digit>\d)")
2706 digit
2707 match
2708 A 0 1
2709 1 2
2710 B 0 1
2711
2712 A pattern with two groups will return a DataFrame with two columns.
2713
2714 >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
2715 letter digit
2716 match
2717 A 0 a 1
2718 1 a 2
2719 B 0 b 1
2720
2721 Optional groups that do not match are NaN in the result.
2722
2723 >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
2724 letter digit
2725 match
2726 A 0 a 1
2727 1 a 2
2728 B 0 b 1
2729 C 0 NaN 1
2730 """
2731 # TODO: dispatch
2732 return str_extractall(self._orig, pat, flags)
2733
    # Template docstring shared by ``find`` and ``rfind``; the %(side)s,
    # %(method)s and %(also)s placeholders are substituted via ``Appender``.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """
2760
2761 @Appender(
2762 _shared_docs["find"]
2763 % {
2764 "side": "lowest",
2765 "method": "find",
2766 "also": "rfind : Return highest indexes in each strings.",
2767 }
2768 )
2769 @forbid_nonstring_types(["bytes"])
2770 def find(self, sub, start: int = 0, end=None):
2771 if not isinstance(sub, str):
2772 msg = f"expected a string object, not {type(sub).__name__}"
2773 raise TypeError(msg)
2774
2775 result = self._data.array._str_find(sub, start, end)
2776 return self._wrap_result(result, returns_string=False)
2777
2778 @Appender(
2779 _shared_docs["find"]
2780 % {
2781 "side": "highest",
2782 "method": "rfind",
2783 "also": "find : Return lowest indexes in each strings.",
2784 }
2785 )
2786 @forbid_nonstring_types(["bytes"])
2787 def rfind(self, sub, start: int = 0, end=None):
2788 if not isinstance(sub, str):
2789 msg = f"expected a string object, not {type(sub).__name__}"
2790 raise TypeError(msg)
2791
2792 result = self._data.array._str_rfind(sub, start=start, end=end)
2793 return self._wrap_result(result, returns_string=False)
2794
2795 @forbid_nonstring_types(["bytes"])
2796 def normalize(self, form):
2797 """
2798 Return the Unicode normal form for the strings in the Series/Index.
2799
2800 For more information on the forms, see the
2801 :func:`unicodedata.normalize`.
2802
2803 Parameters
2804 ----------
2805 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2806 Unicode form.
2807
2808 Returns
2809 -------
2810 Series/Index of objects
2811 """
2812 result = self._data.array._str_normalize(form)
2813 return self._wrap_result(result)
2814
    # Template docstring shared by ``index`` and ``rindex``; placeholders
    # are substituted via ``Appender`` on the methods below.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """
2843
2844 @Appender(
2845 _shared_docs["index"]
2846 % {
2847 "side": "lowest",
2848 "similar": "find",
2849 "method": "index",
2850 "also": "rindex : Return highest indexes in each strings.",
2851 }
2852 )
2853 @forbid_nonstring_types(["bytes"])
2854 def index(self, sub, start: int = 0, end=None):
2855 if not isinstance(sub, str):
2856 msg = f"expected a string object, not {type(sub).__name__}"
2857 raise TypeError(msg)
2858
2859 result = self._data.array._str_index(sub, start=start, end=end)
2860 return self._wrap_result(result, returns_string=False)
2861
2862 @Appender(
2863 _shared_docs["index"]
2864 % {
2865 "side": "highest",
2866 "similar": "rfind",
2867 "method": "rindex",
2868 "also": "index : Return lowest indexes in each strings.",
2869 }
2870 )
2871 @forbid_nonstring_types(["bytes"])
2872 def rindex(self, sub, start: int = 0, end=None):
2873 if not isinstance(sub, str):
2874 msg = f"expected a string object, not {type(sub).__name__}"
2875 raise TypeError(msg)
2876
2877 result = self._data.array._str_rindex(sub, start=start, end=end)
2878 return self._wrap_result(result, returns_string=False)
2879
2880 def len(self):
2881 """
2882 Compute the length of each element in the Series/Index.
2883
2884 The element may be a sequence (such as a string, tuple or list) or a collection
2885 (such as a dictionary).
2886
2887 Returns
2888 -------
2889 Series or Index of int
2890 A Series or Index of integer values indicating the length of each
2891 element in the Series or Index.
2892
2893 See Also
2894 --------
2895 str.len : Python built-in function returning the length of an object.
2896 Series.size : Returns the length of the Series.
2897
2898 Examples
2899 --------
2900 Returns the length (number of characters) in a string. Returns the
2901 number of entries for dictionaries, lists or tuples.
2902
2903 >>> s = pd.Series(['dog',
2904 ... '',
2905 ... 5,
2906 ... {'foo' : 'bar'},
2907 ... [2, 3, 5, 7],
2908 ... ('one', 'two', 'three')])
2909 >>> s
2910 0 dog
2911 1
2912 2 5
2913 3 {'foo': 'bar'}
2914 4 [2, 3, 5, 7]
2915 5 (one, two, three)
2916 dtype: object
2917 >>> s.str.len()
2918 0 3.0
2919 1 0.0
2920 2 NaN
2921 3 1.0
2922 4 4.0
2923 5 3.0
2924 dtype: float64
2925 """
2926 result = self._data.array._str_len()
2927 return self._wrap_result(result, returns_string=False)
2928
    # Template docstring shared by the case-conversion methods below
    # (lower/upper/title/capitalize/swapcase/casefold); placeholders are
    # filled from ``_doc_args`` via ``Appender``.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    # cases:
    #     upper, lower, title, capitalize, swapcase, casefold
    # boolean:
    #   isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "",
    }
3022
3023 @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
3024 @forbid_nonstring_types(["bytes"])
3025 def lower(self):
3026 result = self._data.array._str_lower()
3027 return self._wrap_result(result)
3028
3029 @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
3030 @forbid_nonstring_types(["bytes"])
3031 def upper(self):
3032 result = self._data.array._str_upper()
3033 return self._wrap_result(result)
3034
3035 @Appender(_shared_docs["casemethods"] % _doc_args["title"])
3036 @forbid_nonstring_types(["bytes"])
3037 def title(self):
3038 result = self._data.array._str_title()
3039 return self._wrap_result(result)
3040
3041 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
3042 @forbid_nonstring_types(["bytes"])
3043 def capitalize(self):
3044 result = self._data.array._str_capitalize()
3045 return self._wrap_result(result)
3046
3047 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
3048 @forbid_nonstring_types(["bytes"])
3049 def swapcase(self):
3050 result = self._data.array._str_swapcase()
3051 return self._wrap_result(result)
3052
3053 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
3054 @forbid_nonstring_types(["bytes"])
3055 def casefold(self):
3056 result = self._data.array._str_casefold()
3057 return self._wrap_result(result)
3058
    # Template docstring shared by the boolean character-class predicates
    # (isalnum/isalpha/.../isdecimal) generated below via ``_map_and_wrap``.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    # Placeholder substitutions for each of the generated is* predicates.
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    # Each predicate is generated by ``_map_and_wrap`` (defined elsewhere in
    # this module) with the docstring template above filled in.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )
3239
3240
def cat_safe(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If a column contains non-string (and non-missing) values, with a
        message naming the offending inferred dtype; otherwise the original
        TypeError from the concatenation is re-raised.
    """
    try:
        return cat_core(list_of_columns, sep)
    except TypeError:
        # np.sum raises TypeError when non-string values are present (wrong
        # dtype or hidden behind object dtype); identify the offending
        # column and raise with a clearer message.
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # No offending column found: the TypeError originated elsewhere
        # (e.g. a non-string ``sep``). Previously this fell through to
        # ``return result`` and crashed with UnboundLocalError, masking
        # the real error; re-raise the original exception instead.
        raise
3275
3276
def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Concatenate the given columns element-wise, interleaving ``sep``
    between them.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # Empty separator: sum the columns element-wise directly.
        stacked = np.asarray(list_of_columns, dtype=object)
        return np.sum(stacked, axis=0)
    # Place ``sep`` between consecutive columns, then sum element-wise;
    # string "+" performed by np.sum does the concatenation.
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)
3302
3303
3304def _result_dtype(arr):
3305 # workaround #27953
3306 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
3307 # when the list of values is empty.
3308 from pandas.core.arrays.string_ import StringDtype
3309
3310 if isinstance(arr.dtype, StringDtype):
3311 return arr.dtype
3312 else:
3313 return object
3314
3315
3316def _get_single_group_name(regex: re.Pattern) -> Hashable:
3317 if regex.groupindex:
3318 return next(iter(regex.groupindex))
3319 else:
3320 return None
3321
3322
3323def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3324 """
3325 Get named groups from compiled regex.
3326
3327 Unnamed groups are numbered.
3328
3329 Parameters
3330 ----------
3331 regex : compiled regex
3332
3333 Returns
3334 -------
3335 list of column labels
3336 """
3337 names = {v: k for k, v in regex.groupindex.items()}
3338 return [names.get(1 + i, i) for i in range(regex.groups)]
3339
3340
def str_extractall(arr, pat, flags: int = 0):
    """
    Extract capture groups in ``pat`` from all matches in each string.

    Backing implementation for :meth:`StringMethods.extractall`.

    Parameters
    ----------
    arr : Series or Index
        Subject strings. An Index is converted to a Series with a default
        RangeIndex.
    pat : str
        Regular expression pattern; must contain at least one capture group.
    flags : int, default 0
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``.

    Returns
    -------
    DataFrame
        One row per match, indexed by a MultiIndex whose last level
        ("match") numbers the matches within each subject string.

    Raises
    ------
    ValueError
        If ``pat`` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True)

    columns = _get_group_names(regex)
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # Non-string entries (e.g. NaN) contribute no rows.
        if isinstance(subject, str):
            if not is_mi:
                # Normalize scalar keys to 1-tuples so the concatenation
                # with the match number below is uniform.
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, str):
                    # Single-group patterns yield bare strings from findall.
                    match_tuple = (match_tuple,)
                # Empty-string group matches are treated as missing.
                # Use np.nan (np.NaN alias was removed in NumPy 2.0).
                na_tuple = [np.nan if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                index_list.append(subject_key + (match_i,))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result