1from __future__ import annotations
2
3import codecs
4from functools import wraps
5import re
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Literal,
10 cast,
11)
12import warnings
13
14import numpy as np
15
16from pandas._libs import lib
17from pandas._typing import (
18 AlignJoin,
19 DtypeObj,
20 F,
21 Scalar,
22 npt,
23)
24from pandas.util._decorators import Appender
25from pandas.util._exceptions import find_stack_level
26
27from pandas.core.dtypes.common import (
28 ensure_object,
29 is_bool_dtype,
30 is_integer,
31 is_list_like,
32 is_object_dtype,
33 is_re,
34)
35from pandas.core.dtypes.dtypes import (
36 ArrowDtype,
37 CategoricalDtype,
38)
39from pandas.core.dtypes.generic import (
40 ABCDataFrame,
41 ABCIndex,
42 ABCMultiIndex,
43 ABCSeries,
44)
45from pandas.core.dtypes.missing import isna
46
47from pandas.core.arrays import ExtensionArray
48from pandas.core.base import NoNewAttributesMixin
49from pandas.core.construction import extract_array
50
51if TYPE_CHECKING:
52 from collections.abc import (
53 Hashable,
54 Iterator,
55 )
56
57 from pandas import (
58 DataFrame,
59 Index,
60 Series,
61 )
62
# Registry of docstring templates that are shared (via @Appender with
# %-formatting) by several related methods, e.g. split/rsplit.
_shared_docs: dict[str, str] = {}

# Encodings for which CPython is known to have optimized codec paths;
# presumably used elsewhere in this module to call str.encode/bytes.decode
# directly instead of going through the codecs module — usage not in this
# chunk, confirm against the encode/decode methods.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally has fast paths for the UTF-16/32 codecs.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
74
75
def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Decorator to forbid specific types for a method of StringMethods.

    For calling `.str.{method}` on a Series or Index, it is necessary to first
    initialize the :class:`StringMethods` object, and then call the method.
    However, different methods allow different input types, and so this can not
    be checked during :meth:`StringMethods.__init__`, but must be done on a
    per-method basis. This decorator exists to facilitate this process, and
    make it explicit which (inferred) types are disallowed by the method.

    :meth:`StringMethods.__init__` allows the *union* of types its different
    methods allow (after skipping NaNs; see :meth:`StringMethods._validate`),
    namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].

    The default string types ['string', 'empty'] are allowed for all methods.
    For the additional types ['bytes', 'mixed', 'mixed-integer'], each method
    then needs to forbid the types it is not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. By default, this is
        None, in which case the name from the method being wrapped will be
        copied. However, for working with further wrappers (like _pat_wrapper
        and _noarg_wrapper), it is necessary to specify the name.

    Returns
    -------
    func : wrapper
        The method to which the decorator is applied, with an added check that
        enforces the inferred type to not be in the list of forbidden types.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    disallowed = set(forbidden) if forbidden is not None else set()
    permitted = {"string", "empty", "bytes", "mixed", "mixed-integer"} - disallowed

    def _forbid_nonstring_types(func: F) -> F:
        # prefer the explicitly supplied name (needed when `func` is itself
        # a generic wrapper whose __name__ is not the public method name)
        method_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            inferred = self._inferred_dtype
            if inferred in permitted:
                return func(self, *args, **kwargs)
            raise TypeError(
                f"Cannot use .str.{method_name} with values of "
                f"inferred dtype '{inferred}'."
            )

        wrapper.__name__ = method_name
        # the cast-to-F of the original is a runtime no-op; a type-ignore
        # expresses the same intent without evaluating the TypeVar
        return wrapper  # type: ignore[return-value]

    return _forbid_nonstring_types
143
144
def _map_and_wrap(name: str | None, docstring: str | None):
    """
    Build a no-argument StringMethods method that dispatches to the
    underlying array's ``_str_{name}`` implementation and wraps the result.

    ``isnumeric``/``isdecimal`` produce boolean results, so those are
    flagged as non-string when wrapping.
    """

    @forbid_nonstring_types(["bytes"], name=name)
    def wrapper(self):
        array_method = getattr(self._data.array, f"_str_{name}")
        is_string_result = name not in ("isnumeric", "isdecimal")
        return self._wrap_result(array_method(), returns_string=is_string_result)

    wrapper.__doc__ = docstring
    return wrapper
155
156
class StringMethods(NoNewAttributesMixin):
    """
    Vectorized string functions for Series and Index.

    NAs stay NA unless handled otherwise by a particular method.
    Patterned after Python's string methods, with some inspiration from
    R's stringr package.

    Parameters
    ----------
    data : Series or Index
        The Series or Index whose values the string methods operate on.

    Examples
    --------
    >>> s = pd.Series(["A_Str_Series"])
    >>> s
    0    A_Str_Series
    dtype: object

    >>> s.str.split("_")
    0    [A, Str, Series]
    dtype: object

    >>> s.str.replace("_", "")
    0    AStrSeries
    dtype: object
    """

    # Note: see the docstring in pandas.core.strings.__init__
    # for an explanation of the implementation.
    # TODO: Dispatch all the methods
    # Currently the following are not dispatched to the array
    # * cat
    # * extractall
187
    def __init__(self, data) -> None:
        """
        Validate ``data`` as string-like and cache dtype/metadata flags
        used by the individual string methods.
        """
        from pandas.core.arrays.string_ import StringDtype

        # raises AttributeError if the inferred dtype is not string-like
        self._inferred_dtype = self._validate(data)
        self._is_categorical = isinstance(data.dtype, CategoricalDtype)
        self._is_string = isinstance(data.dtype, StringDtype)
        self._data = data

        # index/name only exist for Series callers; Index callers keep None
        self._index = self._name = None
        if isinstance(data, ABCSeries):
            self._index = data.index
            self._name = data.name

        # ._values.categories works for both Series/Index
        self._parent = data._values.categories if self._is_categorical else data
        # save orig to blow up categoricals to the right type
        self._orig = data
        self._freeze()
206
207 @staticmethod
208 def _validate(data):
209 """
210 Auxiliary function for StringMethods, infers and checks dtype of data.
211
212 This is a "first line of defence" at the creation of the StringMethods-
213 object, and just checks that the dtype is in the
214 *union* of the allowed types over all string methods below; this
215 restriction is then refined on a per-method basis using the decorator
216 @forbid_nonstring_types (more info in the corresponding docstring).
217
218 This really should exclude all series/index with any non-string values,
219 but that isn't practical for performance reasons until we have a str
220 dtype (GH 9343 / 13877)
221
222 Parameters
223 ----------
224 data : The content of the Series
225
226 Returns
227 -------
228 dtype : inferred dtype of data
229 """
230 if isinstance(data, ABCMultiIndex):
231 raise AttributeError(
232 "Can only use .str accessor with Index, not MultiIndex"
233 )
234
235 # see _libs/lib.pyx for list of inferred types
236 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
237
238 data = extract_array(data)
239
240 values = getattr(data, "categories", data) # categorical / normal
241
242 inferred_dtype = lib.infer_dtype(values, skipna=True)
243
244 if inferred_dtype not in allowed_types:
245 raise AttributeError("Can only use .str accessor with string values!")
246 return inferred_dtype
247
248 def __getitem__(self, key):
249 result = self._data.array._str_getitem(key)
250 return self._wrap_result(result)
251
252 def __iter__(self) -> Iterator:
253 raise TypeError(f"'{type(self).__name__}' object is not iterable")
254
    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string: bool = True,
        returns_bool: bool = False,
        dtype=None,
    ):
        """
        Wrap the raw output of an array ``_str_*`` method into a
        Series/Index/DataFrame matching the calling object.

        Parameters
        ----------
        result : array-like, DataFrame, or scalar
            Raw output of the dispatched string method.
        name : Hashable or list-like, optional
            Name for the result, or column labels when expanding.
        expand : bool or None, default None
            Whether to expand list-like elements into separate columns;
            if None, inferred from ``result.ndim``.
        fill_value : scalar, default np.nan
            NOTE(review): not referenced anywhere in this body.
        returns_string : bool, default True
            Whether the wrapped values are strings; controls the dtype
            chosen for string-backed inputs.
        returns_bool : bool, default False
            NOTE(review): not referenced anywhere in this body.
        dtype : dtype, optional
            Explicit dtype for the wrapped result.
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        # scalars and other non-array results pass through unchanged
        # (DataFrames still get __finalize__'d to propagate metadata)
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.
        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1
        elif expand is True and not isinstance(self._orig, ABCIndex):
            # required when expand=True is explicitly specified
            # not needed when inferred
            if isinstance(result.dtype, ArrowDtype):
                import pyarrow as pa

                from pandas.compat import pa_version_under11p0

                from pandas.core.arrays.arrow.array import ArrowExtensionArray

                # pad every list element to the same length so the flattened
                # values can be reshaped into a rectangular (rows, max_len) grid
                value_lengths = pa.compute.list_value_length(result._pa_array)
                max_len = pa.compute.max(value_lengths).as_py()
                min_len = pa.compute.min(value_lengths).as_py()
                if result._hasna:
                    # ArrowExtensionArray.fillna doesn't work for list scalars
                    result = ArrowExtensionArray(
                        result._pa_array.fill_null([None] * max_len)
                    )
                if min_len < max_len:
                    # append nulls to each scalar list element up to max_len
                    if not pa_version_under11p0:
                        result = ArrowExtensionArray(
                            pa.compute.list_slice(
                                result._pa_array,
                                start=0,
                                stop=max_len,
                                return_fixed_size_list=True,
                            )
                        )
                    else:
                        # manual padding fallback for older pyarrow
                        all_null = np.full(max_len, fill_value=None, dtype=object)
                        values = result.to_numpy()
                        new_values = []
                        for row in values:
                            if len(row) < max_len:
                                nulls = all_null[: max_len - len(row)]
                                row = np.append(row, nulls)
                            new_values.append(row)
                        pa_type = result._pa_array.type
                        result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
                if name is not None:
                    labels = name
                else:
                    labels = range(max_len)
                result = (
                    pa.compute.list_flatten(result._pa_array)
                    .to_numpy()
                    .reshape(len(result), max_len)
                )
                # one ArrowExtensionArray column per label
                result = {
                    label: ArrowExtensionArray(pa.array(res))
                    for label, res in zip(labels, result.T)
                }
            elif is_object_dtype(result):

                def cons_row(x):
                    # wrap scalars so every row is list-like before expanding
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                result = [cons_row(x) for x in result]
                if result and not self._is_string:
                    # propagate nan values to match longest sequence (GH 18450)
                    max_len = max(len(x) for x in result)
                    result = [
                        x * max_len if len(x) == 0 or x[0] is np.nan else x
                        for x in result
                    ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out: Index = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name, dtype=dtype)
        else:
            index = self._orig.index
            # This is a mess.
            _dtype: DtypeObj | str | None = dtype
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    _dtype = result.dtype
                elif returns_string:
                    _dtype = self._orig.dtype
                else:
                    _dtype = vdtype
            elif vdtype is not None:
                _dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=_dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=_dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result
413
414 def _get_series_list(self, others):
415 """
416 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
417 into a list of Series (elements without an index must match the length
418 of the calling Series/Index).
419
420 Parameters
421 ----------
422 others : Series, DataFrame, np.ndarray, list-like or list-like of
423 Objects that are either Series, Index or np.ndarray (1-dim).
424
425 Returns
426 -------
427 list of Series
428 Others transformed into list of Series.
429 """
430 from pandas import (
431 DataFrame,
432 Series,
433 )
434
435 # self._orig is either Series or Index
436 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
437
438 # Generally speaking, all objects without an index inherit the index
439 # `idx` of the calling Series/Index - i.e. must have matching length.
440 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
441 if isinstance(others, ABCSeries):
442 return [others]
443 elif isinstance(others, ABCIndex):
444 return [Series(others, index=idx, dtype=others.dtype)]
445 elif isinstance(others, ABCDataFrame):
446 return [others[x] for x in others]
447 elif isinstance(others, np.ndarray) and others.ndim == 2:
448 others = DataFrame(others, index=idx)
449 return [others[x] for x in others]
450 elif is_list_like(others, allow_sets=False):
451 try:
452 others = list(others) # ensure iterators do not get read twice etc
453 except TypeError:
454 # e.g. ser.str, raise below
455 pass
456 else:
457 # in case of list-like `others`, all elements must be
458 # either Series/Index/np.ndarray (1-dim)...
459 if all(
460 isinstance(x, (ABCSeries, ABCIndex, ExtensionArray))
461 or (isinstance(x, np.ndarray) and x.ndim == 1)
462 for x in others
463 ):
464 los: list[Series] = []
465 while others: # iterate through list and append each element
466 los = los + self._get_series_list(others.pop(0))
467 return los
468 # ... or just strings
469 elif all(not is_list_like(x) for x in others):
470 return [Series(others, index=idx)]
471 raise TypeError(
472 "others must be Series, Index, DataFrame, np.ndarray "
473 "or list-like (either containing only strings or "
474 "containing only objects of type Series/Index/"
475 "np.ndarray[1-dim])"
476 )
477
    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(
        self,
        others=None,
        sep: str | None = None,
        na_rep=None,
        join: AlignJoin = "left",
    ) -> str | Series | Index:
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        # TODO: dispatch
        from pandas import (
            Index,
            Series,
            concat,
        )

        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndex):
            # work on a Series so alignment/indexing below behaves uniformly
            data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Series")
            data = ensure_object(data)  # type: ignore[assignment]
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # drop NAs entirely before joining
                return sep.join(data[~na_mask])
            elif na_rep is not None and na_mask.any():
                # substitute na_rep for NAs before joining
                return sep.join(np.where(na_mask, na_rep, data))
            else:
                return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError as err:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            ) from err

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        # object-dtype columns plus a row-wise union of their NA masks
        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            # cat_safe is a module-level helper (not in this view)
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        out: Index | Series
        if isinstance(self._orig.dtype, CategoricalDtype):
            # We need to infer the new categories.
            dtype = self._orig.dtype.categories.dtype
        else:
            dtype = self._orig.dtype
        if isinstance(self._orig, ABCIndex):
            # add dtype for case that result is all-NA
            if isna(result).all():
                dtype = object  # type: ignore[assignment]

            out = Index(result, dtype=dtype, name=self._orig.name)
        else:  # Series
            res_ser = Series(
                result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
            )
            out = res_ser.__finalize__(self._orig, method="str_cat")
        return out
709
    # Docstring template shared by split/rsplit; the %(...)s placeholders are
    # filled per-method below and the result attached via @Appender.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string.

    Parameters
    ----------
    pat : str%(pat_regex)s, optional
        %(pat_description)s.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        - If ``False``, return Series/Index, containing lists of strings.
    %(regex_argument)s
    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).
    %(raises_split)s
    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`,  make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.
    %(regex_pat_note)s
    Examples
    --------
    >>> s = pd.Series(
    ...     [
    ...         "this is a regular sentence",
    ...         "https://docs.python.org/3/tutorial/index.html",
    ...         np.nan
    ...     ]
    ... )
    >>> s
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat="/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                            NaN   NaN   NaN      NaN       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN
    %(regex_examples)s"""
833
834 @Appender(
835 _shared_docs["str_split"]
836 % {
837 "side": "beginning",
838 "pat_regex": " or compiled regex",
839 "pat_description": "String or regular expression to split on",
840 "regex_argument": """
841 regex : bool, default None
842 Determines if the passed-in pattern is a regular expression:
843
844 - If ``True``, assumes the passed-in pattern is a regular expression
845 - If ``False``, treats the pattern as a literal string.
846 - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
847 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
848 - Cannot be set to False if `pat` is a compiled regex
849
850 .. versionadded:: 1.4.0
851 """,
852 "raises_split": """
853 Raises
854 ------
855 ValueError
856 * if `regex` is False and `pat` is a compiled regex
857 """,
858 "regex_pat_note": """
859 Use of `regex =False` with a `pat` as a compiled regex will raise an error.
860 """,
861 "method": "split",
862 "regex_examples": r"""
863 Remember to escape special characters when explicitly using regular expressions.
864
865 >>> s = pd.Series(["foo and bar plus baz"])
866 >>> s.str.split(r"and|plus", expand=True)
867 0 1 2
868 0 foo bar baz
869
870 Regular expressions can be used to handle urls or file names.
871 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
872 as a regex only if ``len(pat) != 1``.
873
874 >>> s = pd.Series(['foojpgbar.jpg'])
875 >>> s.str.split(r".", expand=True)
876 0 1
877 0 foojpgbar jpg
878
879 >>> s.str.split(r"\.jpg", expand=True)
880 0 1
881 0 foojpgbar
882
883 When ``regex=True``, `pat` is interpreted as a regex
884
885 >>> s.str.split(r"\.jpg", regex=True, expand=True)
886 0 1
887 0 foojpgbar
888
889 A compiled regex can be passed as `pat`
890
891 >>> import re
892 >>> s.str.split(re.compile(r"\.jpg"), expand=True)
893 0 1
894 0 foojpgbar
895
896 When ``regex=False``, `pat` is interpreted as the string itself
897
898 >>> s.str.split(r"\.jpg", regex=False, expand=True)
899 0
900 0 foojpgbar.jpg
901 """,
902 }
903 )
904 @forbid_nonstring_types(["bytes"])
905 def split(
906 self,
907 pat: str | re.Pattern | None = None,
908 *,
909 n=-1,
910 expand: bool = False,
911 regex: bool | None = None,
912 ):
913 if regex is False and is_re(pat):
914 raise ValueError(
915 "Cannot use a compiled regex as replacement pattern with regex=False"
916 )
917 if is_re(pat):
918 regex = True
919 result = self._data.array._str_split(pat, n, expand, regex)
920 if self._data.dtype == "category":
921 dtype = self._data.dtype.categories.dtype
922 else:
923 dtype = object if self._data.dtype == object else None
924 return self._wrap_result(
925 result, expand=expand, returns_string=expand, dtype=dtype
926 )
927
928 @Appender(
929 _shared_docs["str_split"]
930 % {
931 "side": "end",
932 "pat_regex": "",
933 "pat_description": "String to split on",
934 "regex_argument": "",
935 "raises_split": "",
936 "regex_pat_note": "",
937 "method": "rsplit",
938 "regex_examples": "",
939 }
940 )
941 @forbid_nonstring_types(["bytes"])
942 def rsplit(self, pat=None, *, n=-1, expand: bool = False):
943 result = self._data.array._str_rsplit(pat, n=n)
944 dtype = object if self._data.dtype == object else None
945 return self._wrap_result(
946 result, expand=expand, returns_string=expand, dtype=dtype
947 )
948
    # Docstring template shared by partition/rpartition; filled via
    # %-formatting and attached with @Appender below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                   0  1            2
    0  Linda van der            Berg
    1    George Pitt     -    Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """
1030
1031 @Appender(
1032 _shared_docs["str_partition"]
1033 % {
1034 "side": "first",
1035 "return": "3 elements containing the string itself, followed by two "
1036 "empty strings",
1037 "also": "rpartition : Split the string at the last occurrence of `sep`.",
1038 }
1039 )
1040 @forbid_nonstring_types(["bytes"])
1041 def partition(self, sep: str = " ", expand: bool = True):
1042 result = self._data.array._str_partition(sep, expand)
1043 if self._data.dtype == "category":
1044 dtype = self._data.dtype.categories.dtype
1045 else:
1046 dtype = object if self._data.dtype == object else None
1047 return self._wrap_result(
1048 result, expand=expand, returns_string=expand, dtype=dtype
1049 )
1050
1051 @Appender(
1052 _shared_docs["str_partition"]
1053 % {
1054 "side": "last",
1055 "return": "3 elements containing two empty strings, followed by the "
1056 "string itself",
1057 "also": "partition : Split the string at the first occurrence of `sep`.",
1058 }
1059 )
1060 @forbid_nonstring_types(["bytes"])
1061 def rpartition(self, sep: str = " ", expand: bool = True):
1062 result = self._data.array._str_rpartition(sep, expand)
1063 if self._data.dtype == "category":
1064 dtype = self._data.dtype.categories.dtype
1065 else:
1066 dtype = object if self._data.dtype == object else None
1067 return self._wrap_result(
1068 result, expand=expand, returns_string=expand, dtype=dtype
1069 )
1070
1071 def get(self, i):
1072 """
1073 Extract element from each component at specified position or with specified key.
1074
1075 Extract element from lists, tuples, dict, or strings in each element in the
1076 Series/Index.
1077
1078 Parameters
1079 ----------
1080 i : int or hashable dict label
1081 Position or key of element to extract.
1082
1083 Returns
1084 -------
1085 Series or Index
1086
1087 Examples
1088 --------
1089 >>> s = pd.Series(["String",
1090 ... (1, 2, 3),
1091 ... ["a", "b", "c"],
1092 ... 123,
1093 ... -456,
1094 ... {1: "Hello", "2": "World"}])
1095 >>> s
1096 0 String
1097 1 (1, 2, 3)
1098 2 [a, b, c]
1099 3 123
1100 4 -456
1101 5 {1: 'Hello', '2': 'World'}
1102 dtype: object
1103
1104 >>> s.str.get(1)
1105 0 t
1106 1 2
1107 2 b
1108 3 NaN
1109 4 NaN
1110 5 Hello
1111 dtype: object
1112
1113 >>> s.str.get(-1)
1114 0 g
1115 1 3
1116 2 c
1117 3 NaN
1118 4 NaN
1119 5 None
1120 dtype: object
1121
1122 Return element with given key
1123
1124 >>> s = pd.Series([{"name": "Hello", "value": "World"},
1125 ... {"name": "Goodbye", "value": "Planet"}])
1126 >>> s.str.get('name')
1127 0 Hello
1128 1 Goodbye
1129 dtype: object
1130 """
1131 result = self._data.array._str_get(i)
1132 return self._wrap_result(result)
1133
1134 @forbid_nonstring_types(["bytes"])
1135 def join(self, sep: str):
1136 """
1137 Join lists contained as elements in the Series/Index with passed delimiter.
1138
1139 If the elements of a Series are lists themselves, join the content of these
1140 lists using the delimiter passed to the function.
1141 This function is an equivalent to :meth:`str.join`.
1142
1143 Parameters
1144 ----------
1145 sep : str
1146 Delimiter to use between list entries.
1147
1148 Returns
1149 -------
1150 Series/Index: object
1151 The list entries concatenated by intervening occurrences of the
1152 delimiter.
1153
1154 Raises
1155 ------
1156 AttributeError
1157 If the supplied Series contains neither strings nor lists.
1158
1159 See Also
1160 --------
1161 str.join : Standard library version of this method.
1162 Series.str.split : Split strings around given separator/delimiter.
1163
1164 Notes
1165 -----
1166 If any of the list items is not a string object, the result of the join
1167 will be `NaN`.
1168
1169 Examples
1170 --------
1171 Example with a list that contains non-string elements.
1172
1173 >>> s = pd.Series([['lion', 'elephant', 'zebra'],
1174 ... [1.1, 2.2, 3.3],
1175 ... ['cat', np.nan, 'dog'],
1176 ... ['cow', 4.5, 'goat'],
1177 ... ['duck', ['swan', 'fish'], 'guppy']])
1178 >>> s
1179 0 [lion, elephant, zebra]
1180 1 [1.1, 2.2, 3.3]
1181 2 [cat, nan, dog]
1182 3 [cow, 4.5, goat]
1183 4 [duck, [swan, fish], guppy]
1184 dtype: object
1185
1186 Join all lists using a '-'. The lists containing object(s) of types other
1187 than str will produce a NaN.
1188
1189 >>> s.str.join('-')
1190 0 lion-elephant-zebra
1191 1 NaN
1192 2 NaN
1193 3 NaN
1194 4 NaN
1195 dtype: object
1196 """
1197 result = self._data.array._str_join(sep)
1198 return self._wrap_result(result)
1199
1200 @forbid_nonstring_types(["bytes"])
1201 def contains(
1202 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
1203 ):
1204 r"""
1205 Test if pattern or regex is contained within a string of a Series or Index.
1206
1207 Return boolean Series or Index based on whether a given pattern or regex is
1208 contained within a string of a Series or Index.
1209
1210 Parameters
1211 ----------
1212 pat : str
1213 Character sequence or regular expression.
1214 case : bool, default True
1215 If True, case sensitive.
1216 flags : int, default 0 (no flags)
1217 Flags to pass through to the re module, e.g. re.IGNORECASE.
1218 na : scalar, optional
1219 Fill value for missing values. The default depends on dtype of the
1220 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1221 ``pandas.NA`` is used.
1222 regex : bool, default True
1223 If True, assumes the pat is a regular expression.
1224
1225 If False, treats the pat as a literal string.
1226
1227 Returns
1228 -------
1229 Series or Index of boolean values
1230 A Series or Index of boolean values indicating whether the
1231 given pattern is contained within the string of each element
1232 of the Series or Index.
1233
1234 See Also
1235 --------
1236 match : Analogous, but stricter, relying on re.match instead of re.search.
1237 Series.str.startswith : Test if the start of each string element matches a
1238 pattern.
1239 Series.str.endswith : Same as startswith, but tests the end of string.
1240
1241 Examples
1242 --------
1243 Returning a Series of booleans using only a literal pattern.
1244
1245 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
1246 >>> s1.str.contains('og', regex=False)
1247 0 False
1248 1 True
1249 2 False
1250 3 False
1251 4 NaN
1252 dtype: object
1253
1254 Returning an Index of booleans using only a literal pattern.
1255
1256 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan])
1257 >>> ind.str.contains('23', regex=False)
1258 Index([False, False, False, True, nan], dtype='object')
1259
1260 Specifying case sensitivity using `case`.
1261
1262 >>> s1.str.contains('oG', case=True, regex=True)
1263 0 False
1264 1 False
1265 2 False
1266 3 False
1267 4 NaN
1268 dtype: object
1269
1270 Specifying `na` to be `False` instead of `NaN` replaces NaN values
1271 with `False`. If Series or Index does not contain NaN values
1272 the resultant dtype will be `bool`, otherwise, an `object` dtype.
1273
1274 >>> s1.str.contains('og', na=False, regex=True)
1275 0 False
1276 1 True
1277 2 False
1278 3 False
1279 4 False
1280 dtype: bool
1281
1282 Returning 'house' or 'dog' when either expression occurs in a string.
1283
1284 >>> s1.str.contains('house|dog', regex=True)
1285 0 False
1286 1 True
1287 2 True
1288 3 False
1289 4 NaN
1290 dtype: object
1291
1292 Ignoring case sensitivity using `flags` with regex.
1293
1294 >>> import re
1295 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
1296 0 False
1297 1 False
1298 2 True
1299 3 False
1300 4 NaN
1301 dtype: object
1302
1303 Returning any digit using regular expression.
1304
1305 >>> s1.str.contains('\\d', regex=True)
1306 0 False
1307 1 False
1308 2 False
1309 3 True
1310 4 NaN
1311 dtype: object
1312
1313 Ensure `pat` is a not a literal pattern when `regex` is set to True.
1314 Note in the following example one might expect only `s2[1]` and `s2[3]` to
1315 return `True`. However, '.0' as a regex matches any character
1316 followed by a 0.
1317
1318 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
1319 >>> s2.str.contains('.0', regex=True)
1320 0 True
1321 1 True
1322 2 False
1323 3 True
1324 4 False
1325 dtype: bool
1326 """
1327 if regex and re.compile(pat).groups:
1328 warnings.warn(
1329 "This pattern is interpreted as a regular expression, and has "
1330 "match groups. To actually get the groups, use str.extract.",
1331 UserWarning,
1332 stacklevel=find_stack_level(),
1333 )
1334
1335 result = self._data.array._str_contains(pat, case, flags, na, regex)
1336 return self._wrap_result(result, fill_value=na, returns_string=False)
1337
1338 @forbid_nonstring_types(["bytes"])
1339 def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
1340 """
1341 Determine if each string starts with a match of a regular expression.
1342
1343 Parameters
1344 ----------
1345 pat : str
1346 Character sequence.
1347 case : bool, default True
1348 If True, case sensitive.
1349 flags : int, default 0 (no flags)
1350 Regex module flags, e.g. re.IGNORECASE.
1351 na : scalar, optional
1352 Fill value for missing values. The default depends on dtype of the
1353 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1354 ``pandas.NA`` is used.
1355
1356 Returns
1357 -------
1358 Series/Index/array of boolean values
1359
1360 See Also
1361 --------
1362 fullmatch : Stricter matching that requires the entire string to match.
1363 contains : Analogous, but less strict, relying on re.search instead of
1364 re.match.
1365 extract : Extract matched groups.
1366
1367 Examples
1368 --------
1369 >>> ser = pd.Series(["horse", "eagle", "donkey"])
1370 >>> ser.str.match("e")
1371 0 False
1372 1 True
1373 2 False
1374 dtype: bool
1375 """
1376 result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
1377 return self._wrap_result(result, fill_value=na, returns_string=False)
1378
1379 @forbid_nonstring_types(["bytes"])
1380 def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
1381 """
1382 Determine if each string entirely matches a regular expression.
1383
1384 Parameters
1385 ----------
1386 pat : str
1387 Character sequence or regular expression.
1388 case : bool, default True
1389 If True, case sensitive.
1390 flags : int, default 0 (no flags)
1391 Regex module flags, e.g. re.IGNORECASE.
1392 na : scalar, optional
1393 Fill value for missing values. The default depends on dtype of the
1394 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1395 ``pandas.NA`` is used.
1396
1397 Returns
1398 -------
1399 Series/Index/array of boolean values
1400
1401 See Also
1402 --------
1403 match : Similar, but also returns `True` when only a *prefix* of the string
1404 matches the regular expression.
1405 extract : Extract matched groups.
1406
1407 Examples
1408 --------
1409 >>> ser = pd.Series(["cat", "duck", "dove"])
1410 >>> ser.str.fullmatch(r'd.+')
1411 0 False
1412 1 True
1413 2 True
1414 dtype: bool
1415 """
1416 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
1417 return self._wrap_result(result, fill_value=na, returns_string=False)
1418
1419 @forbid_nonstring_types(["bytes"])
1420 def replace(
1421 self,
1422 pat: str | re.Pattern,
1423 repl: str | Callable,
1424 n: int = -1,
1425 case: bool | None = None,
1426 flags: int = 0,
1427 regex: bool = False,
1428 ):
1429 r"""
1430 Replace each occurrence of pattern/regex in the Series/Index.
1431
1432 Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
1433 the regex value.
1434
1435 Parameters
1436 ----------
1437 pat : str or compiled regex
1438 String can be a character sequence or regular expression.
1439 repl : str or callable
1440 Replacement string or a callable. The callable is passed the regex
1441 match object and must return a replacement string to be used.
1442 See :func:`re.sub`.
1443 n : int, default -1 (all)
1444 Number of replacements to make from start.
1445 case : bool, default None
1446 Determines if replace is case sensitive:
1447
1448 - If True, case sensitive (the default if `pat` is a string)
1449 - Set to False for case insensitive
1450 - Cannot be set if `pat` is a compiled regex.
1451
1452 flags : int, default 0 (no flags)
1453 Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
1454 regex.
1455 regex : bool, default False
1456 Determines if the passed-in pattern is a regular expression:
1457
1458 - If True, assumes the passed-in pattern is a regular expression.
1459 - If False, treats the pattern as a literal string
1460 - Cannot be set to False if `pat` is a compiled regex or `repl` is
1461 a callable.
1462
1463 Returns
1464 -------
1465 Series or Index of object
1466 A copy of the object with all matching occurrences of `pat` replaced by
1467 `repl`.
1468
1469 Raises
1470 ------
1471 ValueError
1472 * if `regex` is False and `repl` is a callable or `pat` is a compiled
1473 regex
1474 * if `pat` is a compiled regex and `case` or `flags` is set
1475
1476 Notes
1477 -----
1478 When `pat` is a compiled regex, all flags should be included in the
1479 compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
1480 regex will raise an error.
1481
1482 Examples
1483 --------
1484 When `pat` is a string and `regex` is True, the given `pat`
1485 is compiled as a regex. When `repl` is a string, it replaces matching
1486 regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
1487 left as is:
1488
1489 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
1490 0 bao
1491 1 baz
1492 2 NaN
1493 dtype: object
1494
1495 When `pat` is a string and `regex` is False, every `pat` is replaced with
1496 `repl` as with :meth:`str.replace`:
1497
1498 >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
1499 0 bao
1500 1 fuz
1501 2 NaN
1502 dtype: object
1503
1504 When `repl` is a callable, it is called on every `pat` using
1505 :func:`re.sub`. The callable should expect one positional argument
1506 (a regex object) and return a string.
1507
1508 To get the idea:
1509
1510 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
1511 0 <re.Match object; span=(0, 1), match='f'>oo
1512 1 <re.Match object; span=(0, 1), match='f'>uz
1513 2 NaN
1514 dtype: object
1515
1516 Reverse every lowercase alphabetic word:
1517
1518 >>> repl = lambda m: m.group(0)[::-1]
1519 >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
1520 >>> ser.str.replace(r'[a-z]+', repl, regex=True)
1521 0 oof 123
1522 1 rab zab
1523 2 NaN
1524 dtype: object
1525
1526 Using regex groups (extract second group and swap case):
1527
1528 >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
1529 >>> repl = lambda m: m.group('two').swapcase()
1530 >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
1531 >>> ser.str.replace(pat, repl, regex=True)
1532 0 tWO
1533 1 bAR
1534 dtype: object
1535
1536 Using a compiled regex with flags
1537
1538 >>> import re
1539 >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
1540 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
1541 0 foo
1542 1 bar
1543 2 NaN
1544 dtype: object
1545 """
1546 # Check whether repl is valid (GH 13438, GH 15055)
1547 if not (isinstance(repl, str) or callable(repl)):
1548 raise TypeError("repl must be a string or callable")
1549
1550 is_compiled_re = is_re(pat)
1551 if regex or regex is None:
1552 if is_compiled_re and (case is not None or flags != 0):
1553 raise ValueError(
1554 "case and flags cannot be set when pat is a compiled regex"
1555 )
1556
1557 elif is_compiled_re:
1558 raise ValueError(
1559 "Cannot use a compiled regex as replacement pattern with regex=False"
1560 )
1561 elif callable(repl):
1562 raise ValueError("Cannot use a callable replacement when regex=False")
1563
1564 if case is None:
1565 case = True
1566
1567 result = self._data.array._str_replace(
1568 pat, repl, n=n, case=case, flags=flags, regex=regex
1569 )
1570 return self._wrap_result(result)
1571
1572 @forbid_nonstring_types(["bytes"])
1573 def repeat(self, repeats):
1574 """
1575 Duplicate each string in the Series or Index.
1576
1577 Parameters
1578 ----------
1579 repeats : int or sequence of int
1580 Same value for all (int) or different value per (sequence).
1581
1582 Returns
1583 -------
1584 Series or pandas.Index
1585 Series or Index of repeated string objects specified by
1586 input parameter repeats.
1587
1588 Examples
1589 --------
1590 >>> s = pd.Series(['a', 'b', 'c'])
1591 >>> s
1592 0 a
1593 1 b
1594 2 c
1595 dtype: object
1596
1597 Single int repeats string in Series
1598
1599 >>> s.str.repeat(repeats=2)
1600 0 aa
1601 1 bb
1602 2 cc
1603 dtype: object
1604
1605 Sequence of int repeats corresponding string in Series
1606
1607 >>> s.str.repeat(repeats=[1, 2, 3])
1608 0 a
1609 1 bb
1610 2 ccc
1611 dtype: object
1612 """
1613 result = self._data.array._str_repeat(repeats)
1614 return self._wrap_result(result)
1615
1616 @forbid_nonstring_types(["bytes"])
1617 def pad(
1618 self,
1619 width: int,
1620 side: Literal["left", "right", "both"] = "left",
1621 fillchar: str = " ",
1622 ):
1623 """
1624 Pad strings in the Series/Index up to width.
1625
1626 Parameters
1627 ----------
1628 width : int
1629 Minimum width of resulting string; additional characters will be filled
1630 with character defined in `fillchar`.
1631 side : {'left', 'right', 'both'}, default 'left'
1632 Side from which to fill resulting string.
1633 fillchar : str, default ' '
1634 Additional character for filling, default is whitespace.
1635
1636 Returns
1637 -------
1638 Series or Index of object
1639 Returns Series or Index with minimum number of char in object.
1640
1641 See Also
1642 --------
1643 Series.str.rjust : Fills the left side of strings with an arbitrary
1644 character. Equivalent to ``Series.str.pad(side='left')``.
1645 Series.str.ljust : Fills the right side of strings with an arbitrary
1646 character. Equivalent to ``Series.str.pad(side='right')``.
1647 Series.str.center : Fills both sides of strings with an arbitrary
1648 character. Equivalent to ``Series.str.pad(side='both')``.
1649 Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1650 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
1651
1652 Examples
1653 --------
1654 >>> s = pd.Series(["caribou", "tiger"])
1655 >>> s
1656 0 caribou
1657 1 tiger
1658 dtype: object
1659
1660 >>> s.str.pad(width=10)
1661 0 caribou
1662 1 tiger
1663 dtype: object
1664
1665 >>> s.str.pad(width=10, side='right', fillchar='-')
1666 0 caribou---
1667 1 tiger-----
1668 dtype: object
1669
1670 >>> s.str.pad(width=10, side='both', fillchar='-')
1671 0 -caribou--
1672 1 --tiger---
1673 dtype: object
1674 """
1675 if not isinstance(fillchar, str):
1676 msg = f"fillchar must be a character, not {type(fillchar).__name__}"
1677 raise TypeError(msg)
1678
1679 if len(fillchar) != 1:
1680 raise TypeError("fillchar must be a character, not str")
1681
1682 if not is_integer(width):
1683 msg = f"width must be of integer type, not {type(width).__name__}"
1684 raise TypeError(msg)
1685
1686 result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
1687 return self._wrap_result(result)
1688
    # Shared docstring template for the pad-family convenience methods
    # (center/ljust/rjust); rendered via @Appender with the "side" and
    # "method" placeholders filled in per method.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series/Index of objects.

    Examples
    --------
    For Series.str.center:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.center(8, fillchar='.')
    0   ..dog...
    1   ..bird..
    2   .mouse..
    dtype: object

    For Series.str.ljust:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.ljust(8, fillchar='.')
    0   dog.....
    1   bird....
    2   mouse...
    dtype: object

    For Series.str.rjust:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.rjust(8, fillchar='.')
    0   .....dog
    1   ....bird
    2   ...mouse
    dtype: object
    """
1737
1738 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
1739 @forbid_nonstring_types(["bytes"])
1740 def center(self, width: int, fillchar: str = " "):
1741 return self.pad(width, side="both", fillchar=fillchar)
1742
1743 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
1744 @forbid_nonstring_types(["bytes"])
1745 def ljust(self, width: int, fillchar: str = " "):
1746 return self.pad(width, side="right", fillchar=fillchar)
1747
1748 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
1749 @forbid_nonstring_types(["bytes"])
1750 def rjust(self, width: int, fillchar: str = " "):
1751 return self.pad(width, side="left", fillchar=fillchar)
1752
1753 @forbid_nonstring_types(["bytes"])
1754 def zfill(self, width: int):
1755 """
1756 Pad strings in the Series/Index by prepending '0' characters.
1757
1758 Strings in the Series/Index are padded with '0' characters on the
1759 left of the string to reach a total string length `width`. Strings
1760 in the Series/Index with length greater or equal to `width` are
1761 unchanged.
1762
1763 Parameters
1764 ----------
1765 width : int
1766 Minimum length of resulting string; strings with length less
1767 than `width` be prepended with '0' characters.
1768
1769 Returns
1770 -------
1771 Series/Index of objects.
1772
1773 See Also
1774 --------
1775 Series.str.rjust : Fills the left side of strings with an arbitrary
1776 character.
1777 Series.str.ljust : Fills the right side of strings with an arbitrary
1778 character.
1779 Series.str.pad : Fills the specified sides of strings with an arbitrary
1780 character.
1781 Series.str.center : Fills both sides of strings with an arbitrary
1782 character.
1783
1784 Notes
1785 -----
1786 Differs from :meth:`str.zfill` which has special handling
1787 for '+'/'-' in the string.
1788
1789 Examples
1790 --------
1791 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
1792 >>> s
1793 0 -1
1794 1 1
1795 2 1000
1796 3 10
1797 4 NaN
1798 dtype: object
1799
1800 Note that ``10`` and ``NaN`` are not strings, therefore they are
1801 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
1802 special character and the zero is added to the right of it
1803 (:meth:`str.zfill` would have moved it to the left). ``1000``
1804 remains unchanged as it is longer than `width`.
1805
1806 >>> s.str.zfill(3)
1807 0 -01
1808 1 001
1809 2 1000
1810 3 NaN
1811 4 NaN
1812 dtype: object
1813 """
1814 if not is_integer(width):
1815 msg = f"width must be of integer type, not {type(width).__name__}"
1816 raise TypeError(msg)
1817 f = lambda x: x.zfill(width)
1818 result = self._data.array._str_map(f)
1819 return self._wrap_result(result)
1820
1821 def slice(self, start=None, stop=None, step=None):
1822 """
1823 Slice substrings from each element in the Series or Index.
1824
1825 Parameters
1826 ----------
1827 start : int, optional
1828 Start position for slice operation.
1829 stop : int, optional
1830 Stop position for slice operation.
1831 step : int, optional
1832 Step size for slice operation.
1833
1834 Returns
1835 -------
1836 Series or Index of object
1837 Series or Index from sliced substring from original string object.
1838
1839 See Also
1840 --------
1841 Series.str.slice_replace : Replace a slice with a string.
1842 Series.str.get : Return element at position.
1843 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
1844 being the position.
1845
1846 Examples
1847 --------
1848 >>> s = pd.Series(["koala", "dog", "chameleon"])
1849 >>> s
1850 0 koala
1851 1 dog
1852 2 chameleon
1853 dtype: object
1854
1855 >>> s.str.slice(start=1)
1856 0 oala
1857 1 og
1858 2 hameleon
1859 dtype: object
1860
1861 >>> s.str.slice(start=-1)
1862 0 a
1863 1 g
1864 2 n
1865 dtype: object
1866
1867 >>> s.str.slice(stop=2)
1868 0 ko
1869 1 do
1870 2 ch
1871 dtype: object
1872
1873 >>> s.str.slice(step=2)
1874 0 kaa
1875 1 dg
1876 2 caeen
1877 dtype: object
1878
1879 >>> s.str.slice(start=0, stop=5, step=3)
1880 0 kl
1881 1 d
1882 2 cm
1883 dtype: object
1884
1885 Equivalent behaviour to:
1886
1887 >>> s.str[0:5:3]
1888 0 kl
1889 1 d
1890 2 cm
1891 dtype: object
1892 """
1893 result = self._data.array._str_slice(start, stop, step)
1894 return self._wrap_result(result)
1895
1896 @forbid_nonstring_types(["bytes"])
1897 def slice_replace(self, start=None, stop=None, repl=None):
1898 """
1899 Replace a positional slice of a string with another value.
1900
1901 Parameters
1902 ----------
1903 start : int, optional
1904 Left index position to use for the slice. If not specified (None),
1905 the slice is unbounded on the left, i.e. slice from the start
1906 of the string.
1907 stop : int, optional
1908 Right index position to use for the slice. If not specified (None),
1909 the slice is unbounded on the right, i.e. slice until the
1910 end of the string.
1911 repl : str, optional
1912 String for replacement. If not specified (None), the sliced region
1913 is replaced with an empty string.
1914
1915 Returns
1916 -------
1917 Series or Index
1918 Same type as the original object.
1919
1920 See Also
1921 --------
1922 Series.str.slice : Just slicing without replacement.
1923
1924 Examples
1925 --------
1926 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
1927 >>> s
1928 0 a
1929 1 ab
1930 2 abc
1931 3 abdc
1932 4 abcde
1933 dtype: object
1934
1935 Specify just `start`, meaning replace `start` until the end of the
1936 string with `repl`.
1937
1938 >>> s.str.slice_replace(1, repl='X')
1939 0 aX
1940 1 aX
1941 2 aX
1942 3 aX
1943 4 aX
1944 dtype: object
1945
1946 Specify just `stop`, meaning the start of the string to `stop` is replaced
1947 with `repl`, and the rest of the string is included.
1948
1949 >>> s.str.slice_replace(stop=2, repl='X')
1950 0 X
1951 1 X
1952 2 Xc
1953 3 Xdc
1954 4 Xcde
1955 dtype: object
1956
1957 Specify `start` and `stop`, meaning the slice from `start` to `stop` is
1958 replaced with `repl`. Everything before or after `start` and `stop` is
1959 included as is.
1960
1961 >>> s.str.slice_replace(start=1, stop=3, repl='X')
1962 0 aX
1963 1 aX
1964 2 aX
1965 3 aXc
1966 4 aXde
1967 dtype: object
1968 """
1969 result = self._data.array._str_slice_replace(start, stop, repl)
1970 return self._wrap_result(result)
1971
1972 def decode(self, encoding, errors: str = "strict"):
1973 """
1974 Decode character string in the Series/Index using indicated encoding.
1975
1976 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
1977 python3.
1978
1979 Parameters
1980 ----------
1981 encoding : str
1982 errors : str, optional
1983
1984 Returns
1985 -------
1986 Series or Index
1987
1988 Examples
1989 --------
1990 For Series:
1991
1992 >>> ser = pd.Series([b'cow', b'123', b'()'])
1993 >>> ser.str.decode('ascii')
1994 0 cow
1995 1 123
1996 2 ()
1997 dtype: object
1998 """
1999 # TODO: Add a similar _bytes interface.
2000 if encoding in _cpython_optimized_decoders:
2001 # CPython optimized implementation
2002 f = lambda x: x.decode(encoding, errors)
2003 else:
2004 decoder = codecs.getdecoder(encoding)
2005 f = lambda x: decoder(x, errors)[0]
2006 arr = self._data.array
2007 # assert isinstance(arr, (StringArray,))
2008 result = arr._str_map(f)
2009 return self._wrap_result(result)
2010
2011 @forbid_nonstring_types(["bytes"])
2012 def encode(self, encoding, errors: str = "strict"):
2013 """
2014 Encode character string in the Series/Index using indicated encoding.
2015
2016 Equivalent to :meth:`str.encode`.
2017
2018 Parameters
2019 ----------
2020 encoding : str
2021 errors : str, optional
2022
2023 Returns
2024 -------
2025 Series/Index of objects
2026
2027 Examples
2028 --------
2029 >>> ser = pd.Series(['cow', '123', '()'])
2030 >>> ser.str.encode(encoding='ascii')
2031 0 b'cow'
2032 1 b'123'
2033 2 b'()'
2034 dtype: object
2035 """
2036 result = self._data.array._str_encode(encoding, errors)
2037 return self._wrap_result(result, returns_string=False)
2038
    # Shared docstring template for strip/lstrip/rstrip; rendered via
    # @Appender with the "position", "side" and "method" placeholders
    # filled in per method.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """
2114
2115 @Appender(
2116 _shared_docs["str_strip"]
2117 % {
2118 "side": "left and right sides",
2119 "method": "strip",
2120 "position": "leading and trailing",
2121 }
2122 )
2123 @forbid_nonstring_types(["bytes"])
2124 def strip(self, to_strip=None):
2125 result = self._data.array._str_strip(to_strip)
2126 return self._wrap_result(result)
2127
2128 @Appender(
2129 _shared_docs["str_strip"]
2130 % {"side": "left side", "method": "lstrip", "position": "leading"}
2131 )
2132 @forbid_nonstring_types(["bytes"])
2133 def lstrip(self, to_strip=None):
2134 result = self._data.array._str_lstrip(to_strip)
2135 return self._wrap_result(result)
2136
2137 @Appender(
2138 _shared_docs["str_strip"]
2139 % {"side": "right side", "method": "rstrip", "position": "trailing"}
2140 )
2141 @forbid_nonstring_types(["bytes"])
2142 def rstrip(self, to_strip=None):
2143 result = self._data.array._str_rstrip(to_strip)
2144 return self._wrap_result(result)
2145
    # Shared docstring template for removeprefix/removesuffix; rendered via
    # @Appender with the "side" and "other_side" placeholders filled in
    # per method.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """
2193
2194 @Appender(
2195 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
2196 )
2197 @forbid_nonstring_types(["bytes"])
2198 def removeprefix(self, prefix: str):
2199 result = self._data.array._str_removeprefix(prefix)
2200 return self._wrap_result(result)
2201
2202 @Appender(
2203 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
2204 )
2205 @forbid_nonstring_types(["bytes"])
2206 def removesuffix(self, suffix: str):
2207 result = self._data.array._str_removesuffix(suffix)
2208 return self._wrap_result(result)
2209
2210 @forbid_nonstring_types(["bytes"])
2211 def wrap(self, width: int, **kwargs):
2212 r"""
2213 Wrap strings in Series/Index at specified line width.
2214
2215 This method has the same keyword parameters and defaults as
2216 :class:`textwrap.TextWrapper`.
2217
2218 Parameters
2219 ----------
2220 width : int
2221 Maximum line width.
2222 expand_tabs : bool, optional
2223 If True, tab characters will be expanded to spaces (default: True).
2224 replace_whitespace : bool, optional
2225 If True, each whitespace character (as defined by string.whitespace)
2226 remaining after tab expansion will be replaced by a single space
2227 (default: True).
2228 drop_whitespace : bool, optional
2229 If True, whitespace that, after wrapping, happens to end up at the
2230 beginning or end of a line is dropped (default: True).
2231 break_long_words : bool, optional
2232 If True, then words longer than width will be broken in order to ensure
2233 that no lines are longer than width. If it is false, long words will
2234 not be broken, and some lines may be longer than width (default: True).
2235 break_on_hyphens : bool, optional
2236 If True, wrapping will occur preferably on whitespace and right after
2237 hyphens in compound words, as it is customary in English. If false,
2238 only whitespaces will be considered as potentially good places for line
2239 breaks, but you need to set break_long_words to false if you want truly
2240 insecable words (default: True).
2241
2242 Returns
2243 -------
2244 Series or Index
2245
2246 Notes
2247 -----
2248 Internally, this method uses a :class:`textwrap.TextWrapper` instance with
2249 default settings. To achieve behavior matching R's stringr library str_wrap
2250 function, use the arguments:
2251
2252 - expand_tabs = False
2253 - replace_whitespace = True
2254 - drop_whitespace = True
2255 - break_long_words = False
2256 - break_on_hyphens = False
2257
2258 Examples
2259 --------
2260 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
2261 >>> s.str.wrap(12)
2262 0 line to be\nwrapped
2263 1 another line\nto be\nwrapped
2264 dtype: object
2265 """
2266 result = self._data.array._str_wrap(width, **kwargs)
2267 return self._wrap_result(result)
2268
    @forbid_nonstring_types(["bytes"])
    def get_dummies(self, sep: str = "|"):
        """
        Return DataFrame of dummy/indicator variables for Series.

        Each string in Series is split by sep and returned as a DataFrame
        of dummy/indicator variables.

        Parameters
        ----------
        sep : str, default "|"
            String to split on.

        Returns
        -------
        DataFrame
            Dummy variables corresponding to values of the Series.

        See Also
        --------
        get_dummies : Convert categorical variable into dummy/indicator
            variables.

        Examples
        --------
        >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
           a  b  c
        0  1  1  0
        1  1  0  0
        2  1  0  1

        >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
           a  b  c
        0  1  1  0
        1  0  0  0
        2  1  0  1
        """
        # The array implementation returns the indicator matrix together with
        # the labels for the split values; with ``expand=True`` those labels
        # become the column labels of the wrapped DataFrame.
        result, name = self._data.array._str_get_dummies(sep)
        return self._wrap_result(
            result,
            name=name,
            expand=True,
            returns_string=False,
        )
2315
2316 @forbid_nonstring_types(["bytes"])
2317 def translate(self, table):
2318 """
2319 Map all characters in the string through the given mapping table.
2320
2321 Equivalent to standard :meth:`str.translate`.
2322
2323 Parameters
2324 ----------
2325 table : dict
2326 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
2327 None. Unmapped characters are left untouched.
2328 Characters mapped to None are deleted. :meth:`str.maketrans` is a
2329 helper function for making translation tables.
2330
2331 Returns
2332 -------
2333 Series or Index
2334
2335 Examples
2336 --------
2337 >>> ser = pd.Series(["El niño", "Françoise"])
2338 >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'})
2339 >>> ser.str.translate(mytable)
2340 0 El nino
2341 1 Francoise
2342 dtype: object
2343 """
2344 result = self._data.array._str_translate(table)
2345 dtype = object if self._data.dtype == "object" else None
2346 return self._wrap_result(result, dtype=dtype)
2347
    @forbid_nonstring_types(["bytes"])
    def count(self, pat, flags: int = 0):
        r"""
        Count occurrences of pattern in each string of the Series/Index.

        This function is used to count the number of times a particular regex
        pattern is repeated in each of the string elements of the
        :class:`~pandas.Series`.

        Parameters
        ----------
        pat : str
            Valid regular expression.
        flags : int, default 0, meaning no flags
            Flags for the `re` module. For a complete list, `see here
            <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.

        Returns
        -------
        Series or Index
            Same type as the calling object containing the integer counts.

        See Also
        --------
        re : Standard library module for regular expressions.
        str.count : Standard library version, without regular expression support.

        Notes
        -----
        Some characters need to be escaped when passing in `pat`.
        eg. ``'$'`` has a special meaning in regex and must be escaped when
        finding this literal character.

        Examples
        --------
        >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
        >>> s.str.count('a')
        0    0.0
        1    0.0
        2    2.0
        3    2.0
        4    NaN
        5    0.0
        6    1.0
        dtype: float64

        Escape ``'$'`` to find the literal dollar sign.

        >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
        >>> s.str.count('\\$')
        0    1
        1    0
        2    1
        3    2
        4    2
        5    0
        dtype: int64

        This is also available on Index

        >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
        Index([0, 0, 2, 1], dtype='int64')
        """
        result = self._data.array._str_count(pat, flags)
        return self._wrap_result(result, returns_string=False)
2415
2416 @forbid_nonstring_types(["bytes"])
2417 def startswith(
2418 self, pat: str | tuple[str, ...], na: Scalar | None = None
2419 ) -> Series | Index:
2420 """
2421 Test if the start of each string element matches a pattern.
2422
2423 Equivalent to :meth:`str.startswith`.
2424
2425 Parameters
2426 ----------
2427 pat : str or tuple[str, ...]
2428 Character sequence or tuple of strings. Regular expressions are not
2429 accepted.
2430 na : object, default NaN
2431 Object shown if element tested is not a string. The default depends
2432 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2433 For ``StringDtype``, ``pandas.NA`` is used.
2434
2435 Returns
2436 -------
2437 Series or Index of bool
2438 A Series of booleans indicating whether the given pattern matches
2439 the start of each string element.
2440
2441 See Also
2442 --------
2443 str.startswith : Python standard library string method.
2444 Series.str.endswith : Same as startswith, but tests the end of string.
2445 Series.str.contains : Tests if string element contains a pattern.
2446
2447 Examples
2448 --------
2449 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
2450 >>> s
2451 0 bat
2452 1 Bear
2453 2 cat
2454 3 NaN
2455 dtype: object
2456
2457 >>> s.str.startswith('b')
2458 0 True
2459 1 False
2460 2 False
2461 3 NaN
2462 dtype: object
2463
2464 >>> s.str.startswith(('b', 'B'))
2465 0 True
2466 1 True
2467 2 False
2468 3 NaN
2469 dtype: object
2470
2471 Specifying `na` to be `False` instead of `NaN`.
2472
2473 >>> s.str.startswith('b', na=False)
2474 0 True
2475 1 False
2476 2 False
2477 3 False
2478 dtype: bool
2479 """
2480 if not isinstance(pat, (str, tuple)):
2481 msg = f"expected a string or tuple, not {type(pat).__name__}"
2482 raise TypeError(msg)
2483 result = self._data.array._str_startswith(pat, na=na)
2484 return self._wrap_result(result, returns_string=False)
2485
2486 @forbid_nonstring_types(["bytes"])
2487 def endswith(
2488 self, pat: str | tuple[str, ...], na: Scalar | None = None
2489 ) -> Series | Index:
2490 """
2491 Test if the end of each string element matches a pattern.
2492
2493 Equivalent to :meth:`str.endswith`.
2494
2495 Parameters
2496 ----------
2497 pat : str or tuple[str, ...]
2498 Character sequence or tuple of strings. Regular expressions are not
2499 accepted.
2500 na : object, default NaN
2501 Object shown if element tested is not a string. The default depends
2502 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2503 For ``StringDtype``, ``pandas.NA`` is used.
2504
2505 Returns
2506 -------
2507 Series or Index of bool
2508 A Series of booleans indicating whether the given pattern matches
2509 the end of each string element.
2510
2511 See Also
2512 --------
2513 str.endswith : Python standard library string method.
2514 Series.str.startswith : Same as endswith, but tests the start of string.
2515 Series.str.contains : Tests if string element contains a pattern.
2516
2517 Examples
2518 --------
2519 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
2520 >>> s
2521 0 bat
2522 1 bear
2523 2 caT
2524 3 NaN
2525 dtype: object
2526
2527 >>> s.str.endswith('t')
2528 0 True
2529 1 False
2530 2 False
2531 3 NaN
2532 dtype: object
2533
2534 >>> s.str.endswith(('t', 'T'))
2535 0 True
2536 1 False
2537 2 True
2538 3 NaN
2539 dtype: object
2540
2541 Specifying `na` to be `False` instead of `NaN`.
2542
2543 >>> s.str.endswith('t', na=False)
2544 0 True
2545 1 False
2546 2 False
2547 3 False
2548 dtype: bool
2549 """
2550 if not isinstance(pat, (str, tuple)):
2551 msg = f"expected a string or tuple, not {type(pat).__name__}"
2552 raise TypeError(msg)
2553 result = self._data.array._str_endswith(pat, na=na)
2554 return self._wrap_result(result, returns_string=False)
2555
2556 @forbid_nonstring_types(["bytes"])
2557 def findall(self, pat, flags: int = 0):
2558 """
2559 Find all occurrences of pattern or regular expression in the Series/Index.
2560
2561 Equivalent to applying :func:`re.findall` to all the elements in the
2562 Series/Index.
2563
2564 Parameters
2565 ----------
2566 pat : str
2567 Pattern or regular expression.
2568 flags : int, default 0
2569 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
2570 means no flags).
2571
2572 Returns
2573 -------
2574 Series/Index of lists of strings
2575 All non-overlapping matches of pattern or regular expression in each
2576 string of this Series/Index.
2577
2578 See Also
2579 --------
2580 count : Count occurrences of pattern or regular expression in each string
2581 of the Series/Index.
2582 extractall : For each string in the Series, extract groups from all matches
2583 of regular expression and return a DataFrame with one row for each
2584 match and one column for each group.
2585 re.findall : The equivalent ``re`` function to all non-overlapping matches
2586 of pattern or regular expression in string, as a list of strings.
2587
2588 Examples
2589 --------
2590 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
2591
2592 The search for the pattern 'Monkey' returns one match:
2593
2594 >>> s.str.findall('Monkey')
2595 0 []
2596 1 [Monkey]
2597 2 []
2598 dtype: object
2599
2600 On the other hand, the search for the pattern 'MONKEY' doesn't return any
2601 match:
2602
2603 >>> s.str.findall('MONKEY')
2604 0 []
2605 1 []
2606 2 []
2607 dtype: object
2608
2609 Flags can be added to the pattern or regular expression. For instance,
2610 to find the pattern 'MONKEY' ignoring the case:
2611
2612 >>> import re
2613 >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
2614 0 []
2615 1 [Monkey]
2616 2 []
2617 dtype: object
2618
2619 When the pattern matches more than one string in the Series, all matches
2620 are returned:
2621
2622 >>> s.str.findall('on')
2623 0 [on]
2624 1 [on]
2625 2 []
2626 dtype: object
2627
2628 Regular expressions are supported too. For instance, the search for all the
2629 strings ending with the word 'on' is shown next:
2630
2631 >>> s.str.findall('on$')
2632 0 [on]
2633 1 []
2634 2 []
2635 dtype: object
2636
2637 If the pattern is found more than once in the same string, then a list of
2638 multiple strings is returned:
2639
2640 >>> s.str.findall('b')
2641 0 []
2642 1 []
2643 2 [b, b]
2644 dtype: object
2645 """
2646 result = self._data.array._str_findall(pat, flags)
2647 return self._wrap_result(result, returns_string=False)
2648
    @forbid_nonstring_types(["bytes"])
    def extract(
        self, pat: str, flags: int = 0, expand: bool = True
    ) -> DataFrame | Series | Index:
        r"""
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the
        first match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
            modify regular expression matching for things like case,
            spaces, etc. For more details, see :mod:`re`.
        expand : bool, default True
            If True, return DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group
            or DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series or Index
            A DataFrame with one row for each subject string, and one
            column for each group. Any capture group names in regular
            expression pat will be used for column names; otherwise
            capture group numbers will be used. The dtype of each result
            column is always object, even when no match is found. If
            ``expand=False`` and pat has only one capture group, then
            return a Series (if subject is a Series) or Index (if subject
            is an Index).

        See Also
        --------
        extractall : Returns all matches (not just the first match).

        Examples
        --------
        A pattern with two groups will return a DataFrame with two columns.
        Non-matches will be NaN.

        >>> s = pd.Series(['a1', 'b2', 'c3'])
        >>> s.str.extract(r'([ab])(\d)')
             0    1
        0    a    1
        1    b    2
        2  NaN  NaN

        A pattern may contain optional groups.

        >>> s.str.extract(r'([ab])?(\d)')
             0  1
        0    a  1
        1    b  2
        2  NaN  3

        Named groups will become column names in the result.

        >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
          letter digit
        0      a     1
        1      b     2
        2    NaN   NaN

        A pattern with one group will return a DataFrame with one column
        if expand=True.

        >>> s.str.extract(r'[ab](\d)', expand=True)
             0
        0    1
        1    2
        2  NaN

        A pattern with one group will return a Series if expand=False.

        >>> s.str.extract(r'[ab](\d)', expand=False)
        0      1
        1      2
        2    NaN
        dtype: object
        """
        from pandas import DataFrame

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        # Compile eagerly so we can validate the pattern and inspect its
        # capture groups before dispatching.
        regex = re.compile(pat, flags=flags)
        if regex.groups == 0:
            raise ValueError("pattern contains no capture groups")

        # An Index cannot hold the 2-D result that multiple groups would
        # require when expand=False.
        if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
            raise ValueError("only one regex group is supported with Index")

        obj = self._data
        result_dtype = _result_dtype(obj)

        # A DataFrame is produced when there are multiple capture groups or
        # the caller explicitly requested expansion.
        returns_df = regex.groups > 1 or expand

        if returns_df:
            name = None
            columns = _get_group_names(regex)

            if obj.array.size == 0:
                # Nothing to match against: build an empty frame with the
                # expected columns and dtype.
                result = DataFrame(columns=columns, dtype=result_dtype)

            else:
                result_list = self._data.array._str_extract(
                    pat, flags=flags, expand=returns_df
                )

                # Preserve the Series' index; an Index input gets a default
                # RangeIndex on the resulting frame.
                result_index: Index | None
                if isinstance(obj, ABCSeries):
                    result_index = obj.index
                else:
                    result_index = None

                result = DataFrame(
                    result_list, columns=columns, index=result_index, dtype=result_dtype
                )

        else:
            # Single group, expand=False: wrap as Series/Index, named after
            # the group if the pattern named it.
            name = _get_single_group_name(regex)
            result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
        return self._wrap_result(result, name=name, dtype=result_dtype)
2776
    @forbid_nonstring_types(["bytes"])
    def extractall(self, pat, flags: int = 0) -> DataFrame:
        r"""
        Extract capture groups in the regex `pat` as columns in DataFrame.

        For each subject string in the Series, extract groups from all
        matches of regular expression pat. When each subject string in the
        Series has exactly one match, extractall(pat).xs(0, level='match')
        is the same as extract(pat).

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            A ``re`` module flag, for example ``re.IGNORECASE``. These allow
            to modify regular expression matching for things like case, spaces,
            etc. Multiple flags can be combined with the bitwise OR operator,
            for example ``re.IGNORECASE | re.MULTILINE``.

        Returns
        -------
        DataFrame
            A ``DataFrame`` with one row for each match, and one column for each
            group. Its rows have a ``MultiIndex`` with first levels that come from
            the subject ``Series``. The last level is named 'match' and indexes the
            matches in each item of the ``Series``. Any capture group names in
            regular expression pat will be used for column names; otherwise capture
            group numbers will be used.

        See Also
        --------
        extract : Returns first match only (not all matches).

        Examples
        --------
        A pattern with one group will return a DataFrame with one column.
        Indices with no matches will not appear in the result.

        >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
        >>> s.str.extractall(r"[ab](\d)")
                 0
          match
        A 0      1
          1      2
        B 0      1

        Capture group names are used for column names of the result.

        >>> s.str.extractall(r"[ab](?P<digit>\d)")
                digit
          match
        A 0         1
          1         2
        B 0         1

        A pattern with two groups will return a DataFrame with two columns.

        >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
                letter digit
          match
        A 0          a     1
          1          a     2
        B 0          b     1

        Optional groups that do not match are NaN in the result.

        >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
                letter digit
          match
        A 0          a     1
          1          a     2
        B 0          b     1
        C 0        NaN     1
        """
        # TODO: dispatch
        # Unlike most methods here, this delegates to the module-level
        # ``str_extractall`` helper on the original Series/Index rather than
        # to the backing array.
        return str_extractall(self._orig, pat, flags)
2854
    # Shared docstring template for find/rfind; %-formatted with per-method
    # values ("side", "method", "also") at the @Appender call sites below.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s

    Examples
    --------
    For Series.str.find:

    >>> ser = pd.Series(["cow_", "duck_", "do_ve"])
    >>> ser.str.find("_")
    0   3
    1   4
    2   2
    dtype: int64

    For Series.str.rfind:

    >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"])
    >>> ser.str.rfind("_")
    0   4
    1   4
    2   4
    dtype: int64
    """
2901
2902 @Appender(
2903 _shared_docs["find"]
2904 % {
2905 "side": "lowest",
2906 "method": "find",
2907 "also": "rfind : Return highest indexes in each strings.",
2908 }
2909 )
2910 @forbid_nonstring_types(["bytes"])
2911 def find(self, sub, start: int = 0, end=None):
2912 if not isinstance(sub, str):
2913 msg = f"expected a string object, not {type(sub).__name__}"
2914 raise TypeError(msg)
2915
2916 result = self._data.array._str_find(sub, start, end)
2917 return self._wrap_result(result, returns_string=False)
2918
2919 @Appender(
2920 _shared_docs["find"]
2921 % {
2922 "side": "highest",
2923 "method": "rfind",
2924 "also": "find : Return lowest indexes in each strings.",
2925 }
2926 )
2927 @forbid_nonstring_types(["bytes"])
2928 def rfind(self, sub, start: int = 0, end=None):
2929 if not isinstance(sub, str):
2930 msg = f"expected a string object, not {type(sub).__name__}"
2931 raise TypeError(msg)
2932
2933 result = self._data.array._str_rfind(sub, start=start, end=end)
2934 return self._wrap_result(result, returns_string=False)
2935
2936 @forbid_nonstring_types(["bytes"])
2937 def normalize(self, form):
2938 """
2939 Return the Unicode normal form for the strings in the Series/Index.
2940
2941 For more information on the forms, see the
2942 :func:`unicodedata.normalize`.
2943
2944 Parameters
2945 ----------
2946 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2947 Unicode form.
2948
2949 Returns
2950 -------
2951 Series/Index of objects
2952
2953 Examples
2954 --------
2955 >>> ser = pd.Series(['ñ'])
2956 >>> ser.str.normalize('NFC') == ser.str.normalize('NFD')
2957 0 False
2958 dtype: bool
2959 """
2960 result = self._data.array._str_normalize(form)
2961 return self._wrap_result(result)
2962
    # Shared docstring template for index/rindex; %-formatted with per-method
    # values ("side", "similar", "method", "also") at the @Appender call sites.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s

    Examples
    --------
    For Series.str.index:

    >>> ser = pd.Series(["horse", "eagle", "donkey"])
    >>> ser.str.index("e")
    0   4
    1   0
    2   4
    dtype: int64

    For Series.str.rindex:

    >>> ser = pd.Series(["Deer", "eagle", "Sheep"])
    >>> ser.str.rindex("e")
    0   2
    1   4
    2   3
    dtype: int64
    """
3011
3012 @Appender(
3013 _shared_docs["index"]
3014 % {
3015 "side": "lowest",
3016 "similar": "find",
3017 "method": "index",
3018 "also": "rindex : Return highest indexes in each strings.",
3019 }
3020 )
3021 @forbid_nonstring_types(["bytes"])
3022 def index(self, sub, start: int = 0, end=None):
3023 if not isinstance(sub, str):
3024 msg = f"expected a string object, not {type(sub).__name__}"
3025 raise TypeError(msg)
3026
3027 result = self._data.array._str_index(sub, start=start, end=end)
3028 return self._wrap_result(result, returns_string=False)
3029
3030 @Appender(
3031 _shared_docs["index"]
3032 % {
3033 "side": "highest",
3034 "similar": "rfind",
3035 "method": "rindex",
3036 "also": "index : Return lowest indexes in each strings.",
3037 }
3038 )
3039 @forbid_nonstring_types(["bytes"])
3040 def rindex(self, sub, start: int = 0, end=None):
3041 if not isinstance(sub, str):
3042 msg = f"expected a string object, not {type(sub).__name__}"
3043 raise TypeError(msg)
3044
3045 result = self._data.array._str_rindex(sub, start=start, end=end)
3046 return self._wrap_result(result, returns_string=False)
3047
3048 def len(self):
3049 """
3050 Compute the length of each element in the Series/Index.
3051
3052 The element may be a sequence (such as a string, tuple or list) or a collection
3053 (such as a dictionary).
3054
3055 Returns
3056 -------
3057 Series or Index of int
3058 A Series or Index of integer values indicating the length of each
3059 element in the Series or Index.
3060
3061 See Also
3062 --------
3063 str.len : Python built-in function returning the length of an object.
3064 Series.size : Returns the length of the Series.
3065
3066 Examples
3067 --------
3068 Returns the length (number of characters) in a string. Returns the
3069 number of entries for dictionaries, lists or tuples.
3070
3071 >>> s = pd.Series(['dog',
3072 ... '',
3073 ... 5,
3074 ... {'foo' : 'bar'},
3075 ... [2, 3, 5, 7],
3076 ... ('one', 'two', 'three')])
3077 >>> s
3078 0 dog
3079 1
3080 2 5
3081 3 {'foo': 'bar'}
3082 4 [2, 3, 5, 7]
3083 5 (one, two, three)
3084 dtype: object
3085 >>> s.str.len()
3086 0 3.0
3087 1 0.0
3088 2 NaN
3089 3 1.0
3090 4 4.0
3091 5 3.0
3092 dtype: float64
3093 """
3094 result = self._data.array._str_len()
3095 return self._wrap_result(result, returns_string=False)
3096
    # Shared docstring template for the case-conversion methods below;
    # %-formatted with the per-method entries stored in ``_doc_args``.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    #   cases:
    #       upper, lower, title, capitalize, swapcase, casefold
    #   boolean:
    #       isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs;
    # each entry fills the %(type)s / %(method)s / %(version)s placeholders of
    # the shared "casemethods" template.
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "",
    }
3190
3191 @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
3192 @forbid_nonstring_types(["bytes"])
3193 def lower(self):
3194 result = self._data.array._str_lower()
3195 return self._wrap_result(result)
3196
3197 @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
3198 @forbid_nonstring_types(["bytes"])
3199 def upper(self):
3200 result = self._data.array._str_upper()
3201 return self._wrap_result(result)
3202
3203 @Appender(_shared_docs["casemethods"] % _doc_args["title"])
3204 @forbid_nonstring_types(["bytes"])
3205 def title(self):
3206 result = self._data.array._str_title()
3207 return self._wrap_result(result)
3208
3209 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
3210 @forbid_nonstring_types(["bytes"])
3211 def capitalize(self):
3212 result = self._data.array._str_capitalize()
3213 return self._wrap_result(result)
3214
3215 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
3216 @forbid_nonstring_types(["bytes"])
3217 def swapcase(self):
3218 result = self._data.array._str_swapcase()
3219 return self._wrap_result(result)
3220
3221 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
3222 @forbid_nonstring_types(["bytes"])
3223 def casefold(self):
3224 result = self._data.array._str_casefold()
3225 return self._wrap_result(result)
3226
    # Shared docstring template for the is* predicate methods; %-formatted
    # with the per-method entries stored in ``_doc_args`` below.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    # Template substitutions for the shared "ismethods" docstring above.
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    # Each predicate is generated by _map_and_wrap, which dispatches to the
    # corresponding _str_is* array method and attaches the formatted docstring.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )
3407
3408
3409def cat_safe(list_of_columns: list[npt.NDArray[np.object_]], sep: str):
3410 """
3411 Auxiliary function for :meth:`str.cat`.
3412
3413 Same signature as cat_core, but handles TypeErrors in concatenation, which
3414 happen if the arrays in list_of columns have the wrong dtypes or content.
3415
3416 Parameters
3417 ----------
3418 list_of_columns : list of numpy arrays
3419 List of arrays to be concatenated with sep;
3420 these arrays may not contain NaNs!
3421 sep : string
3422 The separator string for concatenating the columns.
3423
3424 Returns
3425 -------
3426 nd.array
3427 The concatenation of list_of_columns with sep.
3428 """
3429 try:
3430 result = cat_core(list_of_columns, sep)
3431 except TypeError:
3432 # if there are any non-string values (wrong dtype or hidden behind
3433 # object dtype), np.sum will fail; catch and return with better message
3434 for column in list_of_columns:
3435 dtype = lib.infer_dtype(column, skipna=True)
3436 if dtype not in ["string", "empty"]:
3437 raise TypeError(
3438 "Concatenation requires list-likes containing only "
3439 "strings (or missing values). Offending values found in "
3440 f"column {dtype}"
3441 ) from None
3442 return result
3443
3444
def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # Empty separator: elementwise-sum the columns directly, without
        # interleaving anything between them.
        return np.sum(np.asarray(list_of_columns, dtype=object), axis=0)
    # Build [col0, sep, col1, sep, ..., colN]: every even slot holds a
    # column, every odd slot the separator, then sum elementwise.
    interleaved: list = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)
3470
3471
3472def _result_dtype(arr):
3473 # workaround #27953
3474 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
3475 # when the list of values is empty.
3476 from pandas.core.arrays.string_ import StringDtype
3477
3478 if isinstance(arr.dtype, (ArrowDtype, StringDtype)):
3479 return arr.dtype
3480 return object
3481
3482
3483def _get_single_group_name(regex: re.Pattern) -> Hashable:
3484 if regex.groupindex:
3485 return next(iter(regex.groupindex))
3486 else:
3487 return None
3488
3489
3490def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3491 """
3492 Get named groups from compiled regex.
3493
3494 Unnamed groups are numbered.
3495
3496 Parameters
3497 ----------
3498 regex : compiled regex
3499
3500 Returns
3501 -------
3502 list of column labels
3503 """
3504 names = {v: k for k, v in regex.groupindex.items()}
3505 return [names.get(1 + i, i) for i in range(regex.groups)]
3506
3507
def str_extractall(arr, pat, flags: int = 0) -> DataFrame:
    """
    Extract all occurrences of the capture groups in ``pat`` from ``arr``.

    Parameters
    ----------
    arr : Series or Index
        Strings to search; non-string entries are skipped.
    pat : str
        Regular expression; must contain at least one capture group.
    flags : int, default 0
        Flags passed to :func:`re.compile`.

    Returns
    -------
    DataFrame
        One row per match, indexed by the original index plus a "match"
        level counting matches within each subject; one column per group.

    Raises
    ------
    ValueError
        If ``pat`` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True).astype(arr.dtype)

    columns = _get_group_names(regex)
    has_multi_index = arr.index.nlevels > 1

    rows: list = []
    keys: list = []
    for key, value in arr.items():
        if not isinstance(value, str):
            # skip missing / non-string entries entirely
            continue
        key_tuple = key if has_multi_index else (key,)
        for match_number, groups in enumerate(regex.findall(value)):
            if isinstance(groups, str):
                # a single-group pattern yields bare strings from findall
                groups = (groups,)
            # empty-string captures become NaN in the result
            rows.append([np.nan if group == "" else group for group in groups])
            keys.append(key_tuple + (match_number,))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(keys, names=arr.index.names + ["match"])
    return arr._constructor_expanddim(
        rows, index=index, columns=columns, dtype=_result_dtype(arr)
    )