1from __future__ import annotations
2
3import codecs
4from functools import wraps
5import re
6from typing import (
7 TYPE_CHECKING,
8 Callable,
9 Hashable,
10 Literal,
11 cast,
12)
13import warnings
14
15import numpy as np
16
17from pandas._libs import lib
18from pandas._typing import (
19 AlignJoin,
20 DtypeObj,
21 F,
22 Scalar,
23)
24from pandas.util._decorators import Appender
25from pandas.util._exceptions import find_stack_level
26
27from pandas.core.dtypes.common import (
28 ensure_object,
29 is_bool_dtype,
30 is_categorical_dtype,
31 is_integer,
32 is_list_like,
33 is_object_dtype,
34 is_re,
35)
36from pandas.core.dtypes.generic import (
37 ABCDataFrame,
38 ABCIndex,
39 ABCMultiIndex,
40 ABCSeries,
41)
42from pandas.core.dtypes.missing import isna
43
44from pandas.core.arrays.arrow.dtype import ArrowDtype
45from pandas.core.base import NoNewAttributesMixin
46from pandas.core.construction import extract_array
47
48if TYPE_CHECKING:
49 from pandas import (
50 DataFrame,
51 Index,
52 Series,
53 )
54
# Docstring templates shared between related methods (e.g. split/rsplit,
# partition/rpartition); filled per method via Appender + ``%`` formatting.
_shared_docs: dict[str, str] = {}
# Encodings for which CPython's str.encode has an optimized fast path;
# presumably used by the encode/decode methods (not in this view) to call
# str/bytes methods directly instead of going through codecs — TODO confirm.
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally has fast paths for utf-16/utf-32.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")
66
67
def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Create a decorator that blocks disallowed inferred dtypes for a method
    of StringMethods.

    Calling ``.str.{method}`` first constructs the :class:`StringMethods`
    accessor, whose :meth:`StringMethods._validate` only checks that the
    data's inferred dtype lies in the *union* of types that any string method
    accepts (after skipping NaNs), namely:
    ['string', 'empty', 'bytes', 'mixed', 'mixed-integer'].
    Since each individual method accepts a different subset, the per-method
    restriction cannot happen in ``__init__`` and is instead applied here,
    at call time.

    The default string types ['string', 'empty'] are allowed for all methods;
    each method forbids whichever of ['bytes', 'mixed', 'mixed-integer'] it is
    not intended for.

    Parameters
    ----------
    forbidden : list-of-str or None
        List of forbidden non-string types, may be one or more of
        `['bytes', 'mixed', 'mixed-integer']`.
    name : str, default None
        Name of the method to use in the error message. If None, the name of
        the wrapped method is used. An explicit name is necessary when
        composing with further wrappers (like _pat_wrapper and
        _noarg_wrapper).

    Returns
    -------
    func : wrapper
        The method to which the decorator is applied, with an added check
        that rejects data whose inferred type is in `forbidden`.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    forbidden_set = set() if forbidden is None else set(forbidden)
    allowed_types = {
        "string",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    } - forbidden_set

    def _forbid_nonstring_types(func: F) -> F:
        func_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # Fast path: type is allowed, just dispatch.
            if self._inferred_dtype in allowed_types:
                return func(self, *args, **kwargs)
            raise TypeError(
                f"Cannot use .str.{func_name} with values of "
                f"inferred dtype '{self._inferred_dtype}'."
            )

        # ``wraps`` copied func.__name__; override with the explicit name so
        # chained wrappers report the user-facing method name.
        wrapper.__name__ = func_name
        return cast(F, wrapper)

    return _forbid_nonstring_types
135
136
def _map_and_wrap(name, docstring):
    """
    Build a no-argument accessor method that dispatches to ``_str_<name>``
    on the underlying array and wraps the raw result.
    """

    def mapped(self):
        # Look up the array-level string op by name at call time and box the
        # result like every other accessor method.
        method = getattr(self._data.array, f"_str_{name}")
        return self._wrap_result(method())

    # Same guard all string methods get: reject bytes-dtype data; this also
    # sets the wrapper's __name__ to ``name``.
    mapped = forbid_nonstring_types(["bytes"], name=name)(mapped)
    mapped.__doc__ = docstring
    return mapped
145
146
class StringMethods(NoNewAttributesMixin):
    """
    Vectorized string functions for Series and Index.

    NAs stay NA unless handled otherwise by a particular method.
    Patterned after Python's string methods, with some inspiration from
    R's stringr package.

    Parameters
    ----------
    data : Series or Index
        The Series/Index whose values the string methods operate on.

    Examples
    --------
    >>> s = pd.Series(["A_Str_Series"])
    >>> s
    0    A_Str_Series
    dtype: object

    >>> s.str.split("_")
    0    [A, Str, Series]
    dtype: object

    >>> s.str.replace("_", "")
    0    AStrSeries
    dtype: object
    """

    # Note: see the docstring in pandas.core.strings.__init__
    # for an explanation of the implementation.
    # TODO: Dispatch all the methods
    # Currently the following are not dispatched to the array
    # * cat
    # * extractall
177
178 def __init__(self, data) -> None:
179 from pandas.core.arrays.string_ import StringDtype
180
181 self._inferred_dtype = self._validate(data)
182 self._is_categorical = is_categorical_dtype(data.dtype)
183 self._is_string = isinstance(data.dtype, StringDtype)
184 self._data = data
185
186 self._index = self._name = None
187 if isinstance(data, ABCSeries):
188 self._index = data.index
189 self._name = data.name
190
191 # ._values.categories works for both Series/Index
192 self._parent = data._values.categories if self._is_categorical else data
193 # save orig to blow up categoricals to the right type
194 self._orig = data
195 self._freeze()
196
197 @staticmethod
198 def _validate(data):
199 """
200 Auxiliary function for StringMethods, infers and checks dtype of data.
201
202 This is a "first line of defence" at the creation of the StringMethods-
203 object, and just checks that the dtype is in the
204 *union* of the allowed types over all string methods below; this
205 restriction is then refined on a per-method basis using the decorator
206 @forbid_nonstring_types (more info in the corresponding docstring).
207
208 This really should exclude all series/index with any non-string values,
209 but that isn't practical for performance reasons until we have a str
210 dtype (GH 9343 / 13877)
211
212 Parameters
213 ----------
214 data : The content of the Series
215
216 Returns
217 -------
218 dtype : inferred dtype of data
219 """
220 if isinstance(data, ABCMultiIndex):
221 raise AttributeError(
222 "Can only use .str accessor with Index, not MultiIndex"
223 )
224
225 # see _libs/lib.pyx for list of inferred types
226 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"]
227
228 data = extract_array(data)
229
230 values = getattr(data, "categories", data) # categorical / normal
231
232 inferred_dtype = lib.infer_dtype(values, skipna=True)
233
234 if inferred_dtype not in allowed_types:
235 raise AttributeError("Can only use .str accessor with string values!")
236 return inferred_dtype
237
238 def __getitem__(self, key):
239 result = self._data.array._str_getitem(key)
240 return self._wrap_result(result)
241
    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string: bool = True,
        returns_bool: bool = False,
    ):
        """
        Box the raw output of an array-level ``_str_*`` method into a
        Series/Index/DataFrame/MultiIndex matching the calling object.

        Parameters
        ----------
        result : array-like, DataFrame or scalar
            Raw output of the dispatched string method. Objects without both
            ``ndim`` and ``dtype`` are returned (almost) unchanged.
        name : Hashable or list-like, optional
            Name for the wrapped result (column labels when expanding).
        expand : bool or None
            Whether to expand list-like elements into separate columns /
            MultiIndex levels; if None, inferred from ``result.ndim``.
        fill_value : default np.nan
            NOTE(review): unused in this body — confirm callers before
            removing from the signature.
        returns_string : bool, default True
            If True and the caller is backed by a string dtype, the wrapped
            result keeps the caller's dtype; otherwise the result's own
            dtype is used.
        returns_bool : bool, default False
            NOTE(review): also unused in this body.
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.
        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1
        elif expand is True and not isinstance(self._orig, ABCIndex):
            # required when expand=True is explicitly specified
            # not needed when inferred
            if isinstance(result.dtype, ArrowDtype):
                import pyarrow as pa

                from pandas.compat import pa_version_under11p0

                from pandas.core.arrays.arrow.array import ArrowExtensionArray

                # pyarrow list result: pad every row's list out to the length
                # of the longest one so rows can be transposed into
                # equal-width columns below.
                value_lengths = result._data.combine_chunks().value_lengths()
                max_len = pa.compute.max(value_lengths).as_py()
                min_len = pa.compute.min(value_lengths).as_py()
                if result._hasna:
                    # ArrowExtensionArray.fillna doesn't work for list scalars
                    result = ArrowExtensionArray(
                        result._data.fill_null([None] * max_len)
                    )
                if min_len < max_len:
                    # append nulls to each scalar list element up to max_len
                    if not pa_version_under11p0:
                        result = ArrowExtensionArray(
                            pa.compute.list_slice(
                                result._data,
                                start=0,
                                stop=max_len,
                                return_fixed_size_list=True,
                            )
                        )
                    else:
                        # manual fallback for older pyarrow without list_slice
                        all_null = np.full(max_len, fill_value=None, dtype=object)
                        values = result.to_numpy()
                        new_values = []
                        for row in values:
                            if len(row) < max_len:
                                nulls = all_null[: max_len - len(row)]
                                row = np.append(row, nulls)
                            new_values.append(row)
                        pa_type = result._data.type
                        result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
                if name is not None:
                    labels = name
                else:
                    labels = range(max_len)
                # transpose rows -> columns; each column becomes its own
                # ArrowExtensionArray keyed by the column label
                result = {
                    label: ArrowExtensionArray(pa.array(res))
                    for label, res in zip(labels, (zip(*result.tolist())))
                }
            elif is_object_dtype(result):

                def cons_row(x):
                    # ensure every element is list-like so rows expand evenly
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                result = [cons_row(x) for x in result]
                if result and not self._is_string:
                    # propagate nan values to match longest sequence (GH 18450)
                    max_len = max(len(x) for x in result)
                    result = [
                        x * max_len if len(x) == 0 or x[0] is np.nan else x
                        for x in result
                    ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            # This is a mess.
            dtype: DtypeObj | str | None
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    dtype = result.dtype
                elif returns_string:
                    dtype = self._orig.dtype
                else:
                    dtype = vdtype
            else:
                dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result
394
395 def _get_series_list(self, others):
396 """
397 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input
398 into a list of Series (elements without an index must match the length
399 of the calling Series/Index).
400
401 Parameters
402 ----------
403 others : Series, DataFrame, np.ndarray, list-like or list-like of
404 Objects that are either Series, Index or np.ndarray (1-dim).
405
406 Returns
407 -------
408 list of Series
409 Others transformed into list of Series.
410 """
411 from pandas import (
412 DataFrame,
413 Series,
414 )
415
416 # self._orig is either Series or Index
417 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index
418
419 # Generally speaking, all objects without an index inherit the index
420 # `idx` of the calling Series/Index - i.e. must have matching length.
421 # Objects with an index (i.e. Series/Index/DataFrame) keep their own.
422 if isinstance(others, ABCSeries):
423 return [others]
424 elif isinstance(others, ABCIndex):
425 return [Series(others, index=idx, dtype=others.dtype)]
426 elif isinstance(others, ABCDataFrame):
427 return [others[x] for x in others]
428 elif isinstance(others, np.ndarray) and others.ndim == 2:
429 others = DataFrame(others, index=idx)
430 return [others[x] for x in others]
431 elif is_list_like(others, allow_sets=False):
432 others = list(others) # ensure iterators do not get read twice etc
433
434 # in case of list-like `others`, all elements must be
435 # either Series/Index/np.ndarray (1-dim)...
436 if all(
437 isinstance(x, (ABCSeries, ABCIndex))
438 or (isinstance(x, np.ndarray) and x.ndim == 1)
439 for x in others
440 ):
441 los: list[Series] = []
442 while others: # iterate through list and append each element
443 los = los + self._get_series_list(others.pop(0))
444 return los
445 # ... or just strings
446 elif all(not is_list_like(x) for x in others):
447 return [Series(others, index=idx)]
448 raise TypeError(
449 "others must be Series, Index, DataFrame, np.ndarray "
450 "or list-like (either containing only strings or "
451 "containing only objects of type Series/Index/"
452 "np.ndarray[1-dim])"
453 )
454
    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(
        self,
        others=None,
        sep=None,
        na_rep=None,
        join: AlignJoin = "left",
    ) -> str | Series | Index:
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        # TODO: dispatch
        from pandas import (
            Index,
            Series,
            concat,
        )

        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndex):
            # work on a Series so the align/concat logic below is uniform
            data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Series")
            data = ensure_object(data)  # type: ignore[assignment]
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # missing values are simply omitted from the joined string
                return sep.join(data[~na_mask])
            elif na_rep is not None and na_mask.any():
                return sep.join(np.where(na_mask, na_rep, data))
            else:
                return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError as err:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            ) from err

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        # rows where *any* column (including the caller) is missing
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        out: Index | Series
        if isinstance(self._orig, ABCIndex):
            # add dtype for case that result is all-NA

            out = Index(result, dtype=object, name=self._orig.name)
        else:  # Series
            if is_categorical_dtype(self._orig.dtype):
                # We need to infer the new categories.
                dtype = None
            else:
                dtype = self._orig.dtype
            res_ser = Series(
                result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
            )
            out = res_ser.__finalize__(self._orig, method="str_cat")
        return out
684
    # Template docstring shared by `split` and `rsplit`; the %(...)s fields
    # are filled in per method by the @Appender decorators on those methods.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string.

    Parameters
    ----------
    pat : str%(pat_regex)s, optional
        %(pat_description)s.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        - If ``False``, return Series/Index, containing lists of strings.
    %(regex_argument)s
    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).
    %(raises_split)s
    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`, make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.
    %(regex_pat_note)s
    Examples
    --------
    >>> s = pd.Series(
    ...     [
    ...         "this is a regular sentence",
    ...         "https://docs.python.org/3/tutorial/index.html",
    ...         np.nan
    ...     ]
    ... )
    >>> s
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat="/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                            NaN   NaN   NaN      NaN       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN
    %(regex_examples)s"""
808
809 @Appender(
810 _shared_docs["str_split"]
811 % {
812 "side": "beginning",
813 "pat_regex": " or compiled regex",
814 "pat_description": "String or regular expression to split on",
815 "regex_argument": """
816 regex : bool, default None
817 Determines if the passed-in pattern is a regular expression:
818
819 - If ``True``, assumes the passed-in pattern is a regular expression
820 - If ``False``, treats the pattern as a literal string.
821 - If ``None`` and `pat` length is 1, treats `pat` as a literal string.
822 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression.
823 - Cannot be set to False if `pat` is a compiled regex
824
825 .. versionadded:: 1.4.0
826 """,
827 "raises_split": """
828 Raises
829 ------
830 ValueError
831 * if `regex` is False and `pat` is a compiled regex
832 """,
833 "regex_pat_note": """
834 Use of `regex =False` with a `pat` as a compiled regex will raise an error.
835 """,
836 "method": "split",
837 "regex_examples": r"""
838 Remember to escape special characters when explicitly using regular expressions.
839
840 >>> s = pd.Series(["foo and bar plus baz"])
841 >>> s.str.split(r"and|plus", expand=True)
842 0 1 2
843 0 foo bar baz
844
845 Regular expressions can be used to handle urls or file names.
846 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled
847 as a regex only if ``len(pat) != 1``.
848
849 >>> s = pd.Series(['foojpgbar.jpg'])
850 >>> s.str.split(r".", expand=True)
851 0 1
852 0 foojpgbar jpg
853
854 >>> s.str.split(r"\.jpg", expand=True)
855 0 1
856 0 foojpgbar
857
858 When ``regex=True``, `pat` is interpreted as a regex
859
860 >>> s.str.split(r"\.jpg", regex=True, expand=True)
861 0 1
862 0 foojpgbar
863
864 A compiled regex can be passed as `pat`
865
866 >>> import re
867 >>> s.str.split(re.compile(r"\.jpg"), expand=True)
868 0 1
869 0 foojpgbar
870
871 When ``regex=False``, `pat` is interpreted as the string itself
872
873 >>> s.str.split(r"\.jpg", regex=False, expand=True)
874 0
875 0 foojpgbar.jpg
876 """,
877 }
878 )
879 @forbid_nonstring_types(["bytes"])
880 def split(
881 self,
882 pat: str | re.Pattern | None = None,
883 *,
884 n=-1,
885 expand: bool = False,
886 regex: bool | None = None,
887 ):
888 if regex is False and is_re(pat):
889 raise ValueError(
890 "Cannot use a compiled regex as replacement pattern with regex=False"
891 )
892 if is_re(pat):
893 regex = True
894 result = self._data.array._str_split(pat, n, expand, regex)
895 return self._wrap_result(result, returns_string=expand, expand=expand)
896
897 @Appender(
898 _shared_docs["str_split"]
899 % {
900 "side": "end",
901 "pat_regex": "",
902 "pat_description": "String to split on",
903 "regex_argument": "",
904 "raises_split": "",
905 "regex_pat_note": "",
906 "method": "rsplit",
907 "regex_examples": "",
908 }
909 )
910 @forbid_nonstring_types(["bytes"])
911 def rsplit(self, pat=None, *, n=-1, expand: bool = False):
912 result = self._data.array._str_rsplit(pat, n=n)
913 return self._wrap_result(result, expand=expand, returns_string=expand)
914
    # Template docstring shared by `partition` and `rpartition`; the %(...)s
    # fields are filled in per method by the @Appender decorators below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                  0  1            2
    0  Linda van der            Berg
    1         George     Pitt-Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """
996
997 @Appender(
998 _shared_docs["str_partition"]
999 % {
1000 "side": "first",
1001 "return": "3 elements containing the string itself, followed by two "
1002 "empty strings",
1003 "also": "rpartition : Split the string at the last occurrence of `sep`.",
1004 }
1005 )
1006 @forbid_nonstring_types(["bytes"])
1007 def partition(self, sep: str = " ", expand: bool = True):
1008 result = self._data.array._str_partition(sep, expand)
1009 return self._wrap_result(result, expand=expand, returns_string=expand)
1010
1011 @Appender(
1012 _shared_docs["str_partition"]
1013 % {
1014 "side": "last",
1015 "return": "3 elements containing two empty strings, followed by the "
1016 "string itself",
1017 "also": "partition : Split the string at the first occurrence of `sep`.",
1018 }
1019 )
1020 @forbid_nonstring_types(["bytes"])
1021 def rpartition(self, sep: str = " ", expand: bool = True):
1022 result = self._data.array._str_rpartition(sep, expand)
1023 return self._wrap_result(result, expand=expand, returns_string=expand)
1024
1025 def get(self, i):
1026 """
1027 Extract element from each component at specified position or with specified key.
1028
1029 Extract element from lists, tuples, dict, or strings in each element in the
1030 Series/Index.
1031
1032 Parameters
1033 ----------
1034 i : int or hashable dict label
1035 Position or key of element to extract.
1036
1037 Returns
1038 -------
1039 Series or Index
1040
1041 Examples
1042 --------
1043 >>> s = pd.Series(["String",
1044 ... (1, 2, 3),
1045 ... ["a", "b", "c"],
1046 ... 123,
1047 ... -456,
1048 ... {1: "Hello", "2": "World"}])
1049 >>> s
1050 0 String
1051 1 (1, 2, 3)
1052 2 [a, b, c]
1053 3 123
1054 4 -456
1055 5 {1: 'Hello', '2': 'World'}
1056 dtype: object
1057
1058 >>> s.str.get(1)
1059 0 t
1060 1 2
1061 2 b
1062 3 NaN
1063 4 NaN
1064 5 Hello
1065 dtype: object
1066
1067 >>> s.str.get(-1)
1068 0 g
1069 1 3
1070 2 c
1071 3 NaN
1072 4 NaN
1073 5 None
1074 dtype: object
1075
1076 Return element with given key
1077
1078 >>> s = pd.Series([{"name": "Hello", "value": "World"},
1079 ... {"name": "Goodbye", "value": "Planet"}])
1080 >>> s.str.get('name')
1081 0 Hello
1082 1 Goodbye
1083 dtype: object
1084 """
1085 result = self._data.array._str_get(i)
1086 return self._wrap_result(result)
1087
1088 @forbid_nonstring_types(["bytes"])
1089 def join(self, sep):
1090 """
1091 Join lists contained as elements in the Series/Index with passed delimiter.
1092
1093 If the elements of a Series are lists themselves, join the content of these
1094 lists using the delimiter passed to the function.
1095 This function is an equivalent to :meth:`str.join`.
1096
1097 Parameters
1098 ----------
1099 sep : str
1100 Delimiter to use between list entries.
1101
1102 Returns
1103 -------
1104 Series/Index: object
1105 The list entries concatenated by intervening occurrences of the
1106 delimiter.
1107
1108 Raises
1109 ------
1110 AttributeError
1111 If the supplied Series contains neither strings nor lists.
1112
1113 See Also
1114 --------
1115 str.join : Standard library version of this method.
1116 Series.str.split : Split strings around given separator/delimiter.
1117
1118 Notes
1119 -----
1120 If any of the list items is not a string object, the result of the join
1121 will be `NaN`.
1122
1123 Examples
1124 --------
1125 Example with a list that contains non-string elements.
1126
1127 >>> s = pd.Series([['lion', 'elephant', 'zebra'],
1128 ... [1.1, 2.2, 3.3],
1129 ... ['cat', np.nan, 'dog'],
1130 ... ['cow', 4.5, 'goat'],
1131 ... ['duck', ['swan', 'fish'], 'guppy']])
1132 >>> s
1133 0 [lion, elephant, zebra]
1134 1 [1.1, 2.2, 3.3]
1135 2 [cat, nan, dog]
1136 3 [cow, 4.5, goat]
1137 4 [duck, [swan, fish], guppy]
1138 dtype: object
1139
1140 Join all lists using a '-'. The lists containing object(s) of types other
1141 than str will produce a NaN.
1142
1143 >>> s.str.join('-')
1144 0 lion-elephant-zebra
1145 1 NaN
1146 2 NaN
1147 3 NaN
1148 4 NaN
1149 dtype: object
1150 """
1151 result = self._data.array._str_join(sep)
1152 return self._wrap_result(result)
1153
1154 @forbid_nonstring_types(["bytes"])
1155 def contains(
1156 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True
1157 ):
1158 r"""
1159 Test if pattern or regex is contained within a string of a Series or Index.
1160
1161 Return boolean Series or Index based on whether a given pattern or regex is
1162 contained within a string of a Series or Index.
1163
1164 Parameters
1165 ----------
1166 pat : str
1167 Character sequence or regular expression.
1168 case : bool, default True
1169 If True, case sensitive.
1170 flags : int, default 0 (no flags)
1171 Flags to pass through to the re module, e.g. re.IGNORECASE.
1172 na : scalar, optional
1173 Fill value for missing values. The default depends on dtype of the
1174 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1175 ``pandas.NA`` is used.
1176 regex : bool, default True
1177 If True, assumes the pat is a regular expression.
1178
1179 If False, treats the pat as a literal string.
1180
1181 Returns
1182 -------
1183 Series or Index of boolean values
1184 A Series or Index of boolean values indicating whether the
1185 given pattern is contained within the string of each element
1186 of the Series or Index.
1187
1188 See Also
1189 --------
1190 match : Analogous, but stricter, relying on re.match instead of re.search.
1191 Series.str.startswith : Test if the start of each string element matches a
1192 pattern.
1193 Series.str.endswith : Same as startswith, but tests the end of string.
1194
1195 Examples
1196 --------
1197 Returning a Series of booleans using only a literal pattern.
1198
1199 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
1200 >>> s1.str.contains('og', regex=False)
1201 0 False
1202 1 True
1203 2 False
1204 3 False
1205 4 NaN
1206 dtype: object
1207
1208 Returning an Index of booleans using only a literal pattern.
1209
1210 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
1211 >>> ind.str.contains('23', regex=False)
1212 Index([False, False, False, True, nan], dtype='object')
1213
1214 Specifying case sensitivity using `case`.
1215
1216 >>> s1.str.contains('oG', case=True, regex=True)
1217 0 False
1218 1 False
1219 2 False
1220 3 False
1221 4 NaN
1222 dtype: object
1223
1224 Specifying `na` to be `False` instead of `NaN` replaces NaN values
1225 with `False`. If Series or Index does not contain NaN values
1226 the resultant dtype will be `bool`, otherwise, an `object` dtype.
1227
1228 >>> s1.str.contains('og', na=False, regex=True)
1229 0 False
1230 1 True
1231 2 False
1232 3 False
1233 4 False
1234 dtype: bool
1235
1236 Returning 'house' or 'dog' when either expression occurs in a string.
1237
1238 >>> s1.str.contains('house|dog', regex=True)
1239 0 False
1240 1 True
1241 2 True
1242 3 False
1243 4 NaN
1244 dtype: object
1245
1246 Ignoring case sensitivity using `flags` with regex.
1247
1248 >>> import re
1249 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True)
1250 0 False
1251 1 False
1252 2 True
1253 3 False
1254 4 NaN
1255 dtype: object
1256
1257 Returning any digit using regular expression.
1258
1259 >>> s1.str.contains('\\d', regex=True)
1260 0 False
1261 1 False
1262 2 False
1263 3 True
1264 4 NaN
1265 dtype: object
1266
1267 Ensure `pat` is a not a literal pattern when `regex` is set to True.
1268 Note in the following example one might expect only `s2[1]` and `s2[3]` to
1269 return `True`. However, '.0' as a regex matches any character
1270 followed by a 0.
1271
1272 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35'])
1273 >>> s2.str.contains('.0', regex=True)
1274 0 True
1275 1 True
1276 2 False
1277 3 True
1278 4 False
1279 dtype: bool
1280 """
1281 if regex and re.compile(pat).groups:
1282 warnings.warn(
1283 "This pattern is interpreted as a regular expression, and has "
1284 "match groups. To actually get the groups, use str.extract.",
1285 UserWarning,
1286 stacklevel=find_stack_level(),
1287 )
1288
1289 result = self._data.array._str_contains(pat, case, flags, na, regex)
1290 return self._wrap_result(result, fill_value=na, returns_string=False)
1291
1292 @forbid_nonstring_types(["bytes"])
1293 def match(self, pat, case: bool = True, flags: int = 0, na=None):
1294 """
1295 Determine if each string starts with a match of a regular expression.
1296
1297 Parameters
1298 ----------
1299 pat : str
1300 Character sequence or regular expression.
1301 case : bool, default True
1302 If True, case sensitive.
1303 flags : int, default 0 (no flags)
1304 Regex module flags, e.g. re.IGNORECASE.
1305 na : scalar, optional
1306 Fill value for missing values. The default depends on dtype of the
1307 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1308 ``pandas.NA`` is used.
1309
1310 Returns
1311 -------
1312 Series/Index/array of boolean values
1313
1314 See Also
1315 --------
1316 fullmatch : Stricter matching that requires the entire string to match.
1317 contains : Analogous, but less strict, relying on re.search instead of
1318 re.match.
1319 extract : Extract matched groups.
1320 """
1321 result = self._data.array._str_match(pat, case=case, flags=flags, na=na)
1322 return self._wrap_result(result, fill_value=na, returns_string=False)
1323
1324 @forbid_nonstring_types(["bytes"])
1325 def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None):
1326 """
1327 Determine if each string entirely matches a regular expression.
1328
1329 .. versionadded:: 1.1.0
1330
1331 Parameters
1332 ----------
1333 pat : str
1334 Character sequence or regular expression.
1335 case : bool, default True
1336 If True, case sensitive.
1337 flags : int, default 0 (no flags)
1338 Regex module flags, e.g. re.IGNORECASE.
1339 na : scalar, optional
1340 Fill value for missing values. The default depends on dtype of the
1341 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``,
1342 ``pandas.NA`` is used.
1343
1344 Returns
1345 -------
1346 Series/Index/array of boolean values
1347
1348 See Also
1349 --------
1350 match : Similar, but also returns `True` when only a *prefix* of the string
1351 matches the regular expression.
1352 extract : Extract matched groups.
1353 """
1354 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na)
1355 return self._wrap_result(result, fill_value=na, returns_string=False)
1356
    @forbid_nonstring_types(["bytes"])
    def replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool | None = None,
        flags: int = 0,
        regex: bool = False,
    ):
        r"""
        Replace each occurrence of pattern/regex in the Series/Index.

        Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on
        the regex value.

        Parameters
        ----------
        pat : str or compiled regex
            String can be a character sequence or regular expression.
        repl : str or callable
            Replacement string or a callable. The callable is passed the regex
            match object and must return a replacement string to be used.
            See :func:`re.sub`.
        n : int, default -1 (all)
            Number of replacements to make from start.
        case : bool, default None
            Determines if replace is case sensitive:

            - If True, case sensitive (the default if `pat` is a string)
            - Set to False for case insensitive
            - Cannot be set if `pat` is a compiled regex.

        flags : int, default 0 (no flags)
            Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled
            regex.
        regex : bool, default False
            Determines if the passed-in pattern is a regular expression:

            - If True, assumes the passed-in pattern is a regular expression.
            - If False, treats the pattern as a literal string
            - Cannot be set to False if `pat` is a compiled regex or `repl` is
              a callable.

        Returns
        -------
        Series or Index of object
            A copy of the object with all matching occurrences of `pat` replaced by
            `repl`.

        Raises
        ------
        ValueError
            * if `regex` is False and `repl` is a callable or `pat` is a compiled
              regex
            * if `pat` is a compiled regex and `case` or `flags` is set

        Notes
        -----
        When `pat` is a compiled regex, all flags should be included in the
        compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled
        regex will raise an error.

        Examples
        --------
        When `pat` is a string and `regex` is True (the default), the given `pat`
        is compiled as a regex. When `repl` is a string, it replaces matching
        regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are
        left as is:

        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True)
        0    bao
        1    baz
        2    NaN
        dtype: object

        When `pat` is a string and `regex` is False, every `pat` is replaced with
        `repl` as with :meth:`str.replace`:

        >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
        0    bao
        1    fuz
        2    NaN
        dtype: object

        When `repl` is a callable, it is called on every `pat` using
        :func:`re.sub`. The callable should expect one positional argument
        (a regex object) and return a string.

        To get the idea:

        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True)
        0    <re.Match object; span=(0, 1), match='f'>oo
        1    <re.Match object; span=(0, 1), match='f'>uz
        2                                            NaN
        dtype: object

        Reverse every lowercase alphabetic word:

        >>> repl = lambda m: m.group(0)[::-1]
        >>> ser = pd.Series(['foo 123', 'bar baz', np.nan])
        >>> ser.str.replace(r'[a-z]+', repl, regex=True)
        0    oof 123
        1    rab zab
        2        NaN
        dtype: object

        Using regex groups (extract second group and swap case):

        >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)"
        >>> repl = lambda m: m.group('two').swapcase()
        >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz'])
        >>> ser.str.replace(pat, repl, regex=True)
        0    tWO
        1    bAR
        dtype: object

        Using a compiled regex with flags

        >>> import re
        >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE)
        >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True)
        0    foo
        1    bar
        2    NaN
        dtype: object
        """
        # Check whether repl is valid (GH 13438, GH 15055)
        if not (isinstance(repl, str) or callable(repl)):
            raise TypeError("repl must be a string or callable")

        is_compiled_re = is_re(pat)
        # NOTE(review): `regex` is annotated as bool, so `regex is None` looks
        # like a leftover from when None was an accepted sentinel; it still
        # routes out-of-contract callers passing None onto the regex path.
        if regex or regex is None:
            # A pre-compiled pattern already encodes its flags and case
            # sensitivity, so the separate arguments must not also be set.
            if is_compiled_re and (case is not None or flags != 0):
                raise ValueError(
                    "case and flags cannot be set when pat is a compiled regex"
                )

        elif is_compiled_re:
            # regex=False contradicts a compiled pattern argument.
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with regex=False"
            )
        elif callable(repl):
            # Callable replacements are only invoked through re.sub, which
            # requires the regex path.
            raise ValueError("Cannot use a callable replacement when regex=False")

        # Default to case-sensitive matching when the caller did not specify.
        if case is None:
            case = True

        result = self._data.array._str_replace(
            pat, repl, n=n, case=case, flags=flags, regex=regex
        )
        return self._wrap_result(result)
1509
1510 @forbid_nonstring_types(["bytes"])
1511 def repeat(self, repeats):
1512 """
1513 Duplicate each string in the Series or Index.
1514
1515 Parameters
1516 ----------
1517 repeats : int or sequence of int
1518 Same value for all (int) or different value per (sequence).
1519
1520 Returns
1521 -------
1522 Series or pandas.Index
1523 Series or Index of repeated string objects specified by
1524 input parameter repeats.
1525
1526 Examples
1527 --------
1528 >>> s = pd.Series(['a', 'b', 'c'])
1529 >>> s
1530 0 a
1531 1 b
1532 2 c
1533 dtype: object
1534
1535 Single int repeats string in Series
1536
1537 >>> s.str.repeat(repeats=2)
1538 0 aa
1539 1 bb
1540 2 cc
1541 dtype: object
1542
1543 Sequence of int repeats corresponding string in Series
1544
1545 >>> s.str.repeat(repeats=[1, 2, 3])
1546 0 a
1547 1 bb
1548 2 ccc
1549 dtype: object
1550 """
1551 result = self._data.array._str_repeat(repeats)
1552 return self._wrap_result(result)
1553
1554 @forbid_nonstring_types(["bytes"])
1555 def pad(
1556 self,
1557 width,
1558 side: Literal["left", "right", "both"] = "left",
1559 fillchar: str = " ",
1560 ):
1561 """
1562 Pad strings in the Series/Index up to width.
1563
1564 Parameters
1565 ----------
1566 width : int
1567 Minimum width of resulting string; additional characters will be filled
1568 with character defined in `fillchar`.
1569 side : {'left', 'right', 'both'}, default 'left'
1570 Side from which to fill resulting string.
1571 fillchar : str, default ' '
1572 Additional character for filling, default is whitespace.
1573
1574 Returns
1575 -------
1576 Series or Index of object
1577 Returns Series or Index with minimum number of char in object.
1578
1579 See Also
1580 --------
1581 Series.str.rjust : Fills the left side of strings with an arbitrary
1582 character. Equivalent to ``Series.str.pad(side='left')``.
1583 Series.str.ljust : Fills the right side of strings with an arbitrary
1584 character. Equivalent to ``Series.str.pad(side='right')``.
1585 Series.str.center : Fills both sides of strings with an arbitrary
1586 character. Equivalent to ``Series.str.pad(side='both')``.
1587 Series.str.zfill : Pad strings in the Series/Index by prepending '0'
1588 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``.
1589
1590 Examples
1591 --------
1592 >>> s = pd.Series(["caribou", "tiger"])
1593 >>> s
1594 0 caribou
1595 1 tiger
1596 dtype: object
1597
1598 >>> s.str.pad(width=10)
1599 0 caribou
1600 1 tiger
1601 dtype: object
1602
1603 >>> s.str.pad(width=10, side='right', fillchar='-')
1604 0 caribou---
1605 1 tiger-----
1606 dtype: object
1607
1608 >>> s.str.pad(width=10, side='both', fillchar='-')
1609 0 -caribou--
1610 1 --tiger---
1611 dtype: object
1612 """
1613 if not isinstance(fillchar, str):
1614 msg = f"fillchar must be a character, not {type(fillchar).__name__}"
1615 raise TypeError(msg)
1616
1617 if len(fillchar) != 1:
1618 raise TypeError("fillchar must be a character, not str")
1619
1620 if not is_integer(width):
1621 msg = f"width must be of integer type, not {type(width).__name__}"
1622 raise TypeError(msg)
1623
1624 result = self._data.array._str_pad(width, side=side, fillchar=fillchar)
1625 return self._wrap_result(result)
1626
    # Docstring template shared by center/ljust/rjust; the %(side)s and
    # %(method)s placeholders are filled in by each method's @Appender.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series/Index of objects.
    """
1646
1647 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
1648 @forbid_nonstring_types(["bytes"])
1649 def center(self, width, fillchar: str = " "):
1650 return self.pad(width, side="both", fillchar=fillchar)
1651
1652 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
1653 @forbid_nonstring_types(["bytes"])
1654 def ljust(self, width, fillchar: str = " "):
1655 return self.pad(width, side="right", fillchar=fillchar)
1656
1657 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
1658 @forbid_nonstring_types(["bytes"])
1659 def rjust(self, width, fillchar: str = " "):
1660 return self.pad(width, side="left", fillchar=fillchar)
1661
1662 @forbid_nonstring_types(["bytes"])
1663 def zfill(self, width):
1664 """
1665 Pad strings in the Series/Index by prepending '0' characters.
1666
1667 Strings in the Series/Index are padded with '0' characters on the
1668 left of the string to reach a total string length `width`. Strings
1669 in the Series/Index with length greater or equal to `width` are
1670 unchanged.
1671
1672 Parameters
1673 ----------
1674 width : int
1675 Minimum length of resulting string; strings with length less
1676 than `width` be prepended with '0' characters.
1677
1678 Returns
1679 -------
1680 Series/Index of objects.
1681
1682 See Also
1683 --------
1684 Series.str.rjust : Fills the left side of strings with an arbitrary
1685 character.
1686 Series.str.ljust : Fills the right side of strings with an arbitrary
1687 character.
1688 Series.str.pad : Fills the specified sides of strings with an arbitrary
1689 character.
1690 Series.str.center : Fills both sides of strings with an arbitrary
1691 character.
1692
1693 Notes
1694 -----
1695 Differs from :meth:`str.zfill` which has special handling
1696 for '+'/'-' in the string.
1697
1698 Examples
1699 --------
1700 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan])
1701 >>> s
1702 0 -1
1703 1 1
1704 2 1000
1705 3 10
1706 4 NaN
1707 dtype: object
1708
1709 Note that ``10`` and ``NaN`` are not strings, therefore they are
1710 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a
1711 special character and the zero is added to the right of it
1712 (:meth:`str.zfill` would have moved it to the left). ``1000``
1713 remains unchanged as it is longer than `width`.
1714
1715 >>> s.str.zfill(3)
1716 0 -01
1717 1 001
1718 2 1000
1719 3 NaN
1720 4 NaN
1721 dtype: object
1722 """
1723 if not is_integer(width):
1724 msg = f"width must be of integer type, not {type(width).__name__}"
1725 raise TypeError(msg)
1726 f = lambda x: x.zfill(width)
1727 result = self._data.array._str_map(f)
1728 return self._wrap_result(result)
1729
1730 def slice(self, start=None, stop=None, step=None):
1731 """
1732 Slice substrings from each element in the Series or Index.
1733
1734 Parameters
1735 ----------
1736 start : int, optional
1737 Start position for slice operation.
1738 stop : int, optional
1739 Stop position for slice operation.
1740 step : int, optional
1741 Step size for slice operation.
1742
1743 Returns
1744 -------
1745 Series or Index of object
1746 Series or Index from sliced substring from original string object.
1747
1748 See Also
1749 --------
1750 Series.str.slice_replace : Replace a slice with a string.
1751 Series.str.get : Return element at position.
1752 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i`
1753 being the position.
1754
1755 Examples
1756 --------
1757 >>> s = pd.Series(["koala", "dog", "chameleon"])
1758 >>> s
1759 0 koala
1760 1 dog
1761 2 chameleon
1762 dtype: object
1763
1764 >>> s.str.slice(start=1)
1765 0 oala
1766 1 og
1767 2 hameleon
1768 dtype: object
1769
1770 >>> s.str.slice(start=-1)
1771 0 a
1772 1 g
1773 2 n
1774 dtype: object
1775
1776 >>> s.str.slice(stop=2)
1777 0 ko
1778 1 do
1779 2 ch
1780 dtype: object
1781
1782 >>> s.str.slice(step=2)
1783 0 kaa
1784 1 dg
1785 2 caeen
1786 dtype: object
1787
1788 >>> s.str.slice(start=0, stop=5, step=3)
1789 0 kl
1790 1 d
1791 2 cm
1792 dtype: object
1793
1794 Equivalent behaviour to:
1795
1796 >>> s.str[0:5:3]
1797 0 kl
1798 1 d
1799 2 cm
1800 dtype: object
1801 """
1802 result = self._data.array._str_slice(start, stop, step)
1803 return self._wrap_result(result)
1804
1805 @forbid_nonstring_types(["bytes"])
1806 def slice_replace(self, start=None, stop=None, repl=None):
1807 """
1808 Replace a positional slice of a string with another value.
1809
1810 Parameters
1811 ----------
1812 start : int, optional
1813 Left index position to use for the slice. If not specified (None),
1814 the slice is unbounded on the left, i.e. slice from the start
1815 of the string.
1816 stop : int, optional
1817 Right index position to use for the slice. If not specified (None),
1818 the slice is unbounded on the right, i.e. slice until the
1819 end of the string.
1820 repl : str, optional
1821 String for replacement. If not specified (None), the sliced region
1822 is replaced with an empty string.
1823
1824 Returns
1825 -------
1826 Series or Index
1827 Same type as the original object.
1828
1829 See Also
1830 --------
1831 Series.str.slice : Just slicing without replacement.
1832
1833 Examples
1834 --------
1835 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde'])
1836 >>> s
1837 0 a
1838 1 ab
1839 2 abc
1840 3 abdc
1841 4 abcde
1842 dtype: object
1843
1844 Specify just `start`, meaning replace `start` until the end of the
1845 string with `repl`.
1846
1847 >>> s.str.slice_replace(1, repl='X')
1848 0 aX
1849 1 aX
1850 2 aX
1851 3 aX
1852 4 aX
1853 dtype: object
1854
1855 Specify just `stop`, meaning the start of the string to `stop` is replaced
1856 with `repl`, and the rest of the string is included.
1857
1858 >>> s.str.slice_replace(stop=2, repl='X')
1859 0 X
1860 1 X
1861 2 Xc
1862 3 Xdc
1863 4 Xcde
1864 dtype: object
1865
1866 Specify `start` and `stop`, meaning the slice from `start` to `stop` is
1867 replaced with `repl`. Everything before or after `start` and `stop` is
1868 included as is.
1869
1870 >>> s.str.slice_replace(start=1, stop=3, repl='X')
1871 0 aX
1872 1 aX
1873 2 aX
1874 3 aXc
1875 4 aXde
1876 dtype: object
1877 """
1878 result = self._data.array._str_slice_replace(start, stop, repl)
1879 return self._wrap_result(result)
1880
1881 def decode(self, encoding, errors: str = "strict"):
1882 """
1883 Decode character string in the Series/Index using indicated encoding.
1884
1885 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in
1886 python3.
1887
1888 Parameters
1889 ----------
1890 encoding : str
1891 errors : str, optional
1892
1893 Returns
1894 -------
1895 Series or Index
1896 """
1897 # TODO: Add a similar _bytes interface.
1898 if encoding in _cpython_optimized_decoders:
1899 # CPython optimized implementation
1900 f = lambda x: x.decode(encoding, errors)
1901 else:
1902 decoder = codecs.getdecoder(encoding)
1903 f = lambda x: decoder(x, errors)[0]
1904 arr = self._data.array
1905 # assert isinstance(arr, (StringArray,))
1906 result = arr._str_map(f)
1907 return self._wrap_result(result)
1908
1909 @forbid_nonstring_types(["bytes"])
1910 def encode(self, encoding, errors: str = "strict"):
1911 """
1912 Encode character string in the Series/Index using indicated encoding.
1913
1914 Equivalent to :meth:`str.encode`.
1915
1916 Parameters
1917 ----------
1918 encoding : str
1919 errors : str, optional
1920
1921 Returns
1922 -------
1923 Series/Index of objects
1924 """
1925 result = self._data.array._str_encode(encoding, errors)
1926 return self._wrap_result(result, returns_string=False)
1927
    # Docstring template shared by strip/lstrip/rstrip; %(position)s, %(side)s
    # and %(method)s are filled in by each method's @Appender.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """
2003
2004 @Appender(
2005 _shared_docs["str_strip"]
2006 % {
2007 "side": "left and right sides",
2008 "method": "strip",
2009 "position": "leading and trailing",
2010 }
2011 )
2012 @forbid_nonstring_types(["bytes"])
2013 def strip(self, to_strip=None):
2014 result = self._data.array._str_strip(to_strip)
2015 return self._wrap_result(result)
2016
2017 @Appender(
2018 _shared_docs["str_strip"]
2019 % {"side": "left side", "method": "lstrip", "position": "leading"}
2020 )
2021 @forbid_nonstring_types(["bytes"])
2022 def lstrip(self, to_strip=None):
2023 result = self._data.array._str_lstrip(to_strip)
2024 return self._wrap_result(result)
2025
2026 @Appender(
2027 _shared_docs["str_strip"]
2028 % {"side": "right side", "method": "rstrip", "position": "trailing"}
2029 )
2030 @forbid_nonstring_types(["bytes"])
2031 def rstrip(self, to_strip=None):
2032 result = self._data.array._str_rstrip(to_strip)
2033 return self._wrap_result(result)
2034
    # Docstring template shared by removeprefix/removesuffix; %(side)s and
    # %(other_side)s are filled in by each method's @Appender.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """
2082
2083 @Appender(
2084 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"}
2085 )
2086 @forbid_nonstring_types(["bytes"])
2087 def removeprefix(self, prefix):
2088 result = self._data.array._str_removeprefix(prefix)
2089 return self._wrap_result(result)
2090
2091 @Appender(
2092 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"}
2093 )
2094 @forbid_nonstring_types(["bytes"])
2095 def removesuffix(self, suffix):
2096 result = self._data.array._str_removesuffix(suffix)
2097 return self._wrap_result(result)
2098
2099 @forbid_nonstring_types(["bytes"])
2100 def wrap(self, width, **kwargs):
2101 r"""
2102 Wrap strings in Series/Index at specified line width.
2103
2104 This method has the same keyword parameters and defaults as
2105 :class:`textwrap.TextWrapper`.
2106
2107 Parameters
2108 ----------
2109 width : int
2110 Maximum line width.
2111 expand_tabs : bool, optional
2112 If True, tab characters will be expanded to spaces (default: True).
2113 replace_whitespace : bool, optional
2114 If True, each whitespace character (as defined by string.whitespace)
2115 remaining after tab expansion will be replaced by a single space
2116 (default: True).
2117 drop_whitespace : bool, optional
2118 If True, whitespace that, after wrapping, happens to end up at the
2119 beginning or end of a line is dropped (default: True).
2120 break_long_words : bool, optional
2121 If True, then words longer than width will be broken in order to ensure
2122 that no lines are longer than width. If it is false, long words will
2123 not be broken, and some lines may be longer than width (default: True).
2124 break_on_hyphens : bool, optional
2125 If True, wrapping will occur preferably on whitespace and right after
2126 hyphens in compound words, as it is customary in English. If false,
2127 only whitespaces will be considered as potentially good places for line
2128 breaks, but you need to set break_long_words to false if you want truly
2129 insecable words (default: True).
2130
2131 Returns
2132 -------
2133 Series or Index
2134
2135 Notes
2136 -----
2137 Internally, this method uses a :class:`textwrap.TextWrapper` instance with
2138 default settings. To achieve behavior matching R's stringr library str_wrap
2139 function, use the arguments:
2140
2141 - expand_tabs = False
2142 - replace_whitespace = True
2143 - drop_whitespace = True
2144 - break_long_words = False
2145 - break_on_hyphens = False
2146
2147 Examples
2148 --------
2149 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped'])
2150 >>> s.str.wrap(12)
2151 0 line to be\nwrapped
2152 1 another line\nto be\nwrapped
2153 dtype: object
2154 """
2155 result = self._data.array._str_wrap(width, **kwargs)
2156 return self._wrap_result(result)
2157
2158 @forbid_nonstring_types(["bytes"])
2159 def get_dummies(self, sep: str = "|"):
2160 """
2161 Return DataFrame of dummy/indicator variables for Series.
2162
2163 Each string in Series is split by sep and returned as a DataFrame
2164 of dummy/indicator variables.
2165
2166 Parameters
2167 ----------
2168 sep : str, default "|"
2169 String to split on.
2170
2171 Returns
2172 -------
2173 DataFrame
2174 Dummy variables corresponding to values of the Series.
2175
2176 See Also
2177 --------
2178 get_dummies : Convert categorical variable into dummy/indicator
2179 variables.
2180
2181 Examples
2182 --------
2183 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies()
2184 a b c
2185 0 1 1 0
2186 1 1 0 0
2187 2 1 0 1
2188
2189 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies()
2190 a b c
2191 0 1 1 0
2192 1 0 0 0
2193 2 1 0 1
2194 """
2195 # we need to cast to Series of strings as only that has all
2196 # methods available for making the dummies...
2197 result, name = self._data.array._str_get_dummies(sep)
2198 return self._wrap_result(
2199 result,
2200 name=name,
2201 expand=True,
2202 returns_string=False,
2203 )
2204
2205 @forbid_nonstring_types(["bytes"])
2206 def translate(self, table):
2207 """
2208 Map all characters in the string through the given mapping table.
2209
2210 Equivalent to standard :meth:`str.translate`.
2211
2212 Parameters
2213 ----------
2214 table : dict
2215 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or
2216 None. Unmapped characters are left untouched.
2217 Characters mapped to None are deleted. :meth:`str.maketrans` is a
2218 helper function for making translation tables.
2219
2220 Returns
2221 -------
2222 Series or Index
2223 """
2224 result = self._data.array._str_translate(table)
2225 return self._wrap_result(result)
2226
2227 @forbid_nonstring_types(["bytes"])
2228 def count(self, pat, flags: int = 0):
2229 r"""
2230 Count occurrences of pattern in each string of the Series/Index.
2231
2232 This function is used to count the number of times a particular regex
2233 pattern is repeated in each of the string elements of the
2234 :class:`~pandas.Series`.
2235
2236 Parameters
2237 ----------
2238 pat : str
2239 Valid regular expression.
2240 flags : int, default 0, meaning no flags
2241 Flags for the `re` module. For a complete list, `see here
2242 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
2243 **kwargs
2244 For compatibility with other string methods. Not used.
2245
2246 Returns
2247 -------
2248 Series or Index
2249 Same type as the calling object containing the integer counts.
2250
2251 See Also
2252 --------
2253 re : Standard library module for regular expressions.
2254 str.count : Standard library version, without regular expression support.
2255
2256 Notes
2257 -----
2258 Some characters need to be escaped when passing in `pat`.
2259 eg. ``'$'`` has a special meaning in regex and must be escaped when
2260 finding this literal character.
2261
2262 Examples
2263 --------
2264 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat'])
2265 >>> s.str.count('a')
2266 0 0.0
2267 1 0.0
2268 2 2.0
2269 3 2.0
2270 4 NaN
2271 5 0.0
2272 6 1.0
2273 dtype: float64
2274
2275 Escape ``'$'`` to find the literal dollar sign.
2276
2277 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat'])
2278 >>> s.str.count('\\$')
2279 0 1
2280 1 0
2281 2 1
2282 3 2
2283 4 2
2284 5 0
2285 dtype: int64
2286
2287 This is also available on Index
2288
2289 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a')
2290 Index([0, 0, 2, 1], dtype='int64')
2291 """
2292 result = self._data.array._str_count(pat, flags)
2293 return self._wrap_result(result, returns_string=False)
2294
2295 @forbid_nonstring_types(["bytes"])
2296 def startswith(
2297 self, pat: str | tuple[str, ...], na: Scalar | None = None
2298 ) -> Series | Index:
2299 """
2300 Test if the start of each string element matches a pattern.
2301
2302 Equivalent to :meth:`str.startswith`.
2303
2304 Parameters
2305 ----------
2306 pat : str or tuple[str, ...]
2307 Character sequence or tuple of strings. Regular expressions are not
2308 accepted.
2309 na : object, default NaN
2310 Object shown if element tested is not a string. The default depends
2311 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2312 For ``StringDtype``, ``pandas.NA`` is used.
2313
2314 Returns
2315 -------
2316 Series or Index of bool
2317 A Series of booleans indicating whether the given pattern matches
2318 the start of each string element.
2319
2320 See Also
2321 --------
2322 str.startswith : Python standard library string method.
2323 Series.str.endswith : Same as startswith, but tests the end of string.
2324 Series.str.contains : Tests if string element contains a pattern.
2325
2326 Examples
2327 --------
2328 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan])
2329 >>> s
2330 0 bat
2331 1 Bear
2332 2 cat
2333 3 NaN
2334 dtype: object
2335
2336 >>> s.str.startswith('b')
2337 0 True
2338 1 False
2339 2 False
2340 3 NaN
2341 dtype: object
2342
2343 >>> s.str.startswith(('b', 'B'))
2344 0 True
2345 1 True
2346 2 False
2347 3 NaN
2348 dtype: object
2349
2350 Specifying `na` to be `False` instead of `NaN`.
2351
2352 >>> s.str.startswith('b', na=False)
2353 0 True
2354 1 False
2355 2 False
2356 3 False
2357 dtype: bool
2358 """
2359 if not isinstance(pat, (str, tuple)):
2360 msg = f"expected a string or tuple, not {type(pat).__name__}"
2361 raise TypeError(msg)
2362 result = self._data.array._str_startswith(pat, na=na)
2363 return self._wrap_result(result, returns_string=False)
2364
2365 @forbid_nonstring_types(["bytes"])
2366 def endswith(
2367 self, pat: str | tuple[str, ...], na: Scalar | None = None
2368 ) -> Series | Index:
2369 """
2370 Test if the end of each string element matches a pattern.
2371
2372 Equivalent to :meth:`str.endswith`.
2373
2374 Parameters
2375 ----------
2376 pat : str or tuple[str, ...]
2377 Character sequence or tuple of strings. Regular expressions are not
2378 accepted.
2379 na : object, default NaN
2380 Object shown if element tested is not a string. The default depends
2381 on dtype of the array. For object-dtype, ``numpy.nan`` is used.
2382 For ``StringDtype``, ``pandas.NA`` is used.
2383
2384 Returns
2385 -------
2386 Series or Index of bool
2387 A Series of booleans indicating whether the given pattern matches
2388 the end of each string element.
2389
2390 See Also
2391 --------
2392 str.endswith : Python standard library string method.
2393 Series.str.startswith : Same as endswith, but tests the start of string.
2394 Series.str.contains : Tests if string element contains a pattern.
2395
2396 Examples
2397 --------
2398 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan])
2399 >>> s
2400 0 bat
2401 1 bear
2402 2 caT
2403 3 NaN
2404 dtype: object
2405
2406 >>> s.str.endswith('t')
2407 0 True
2408 1 False
2409 2 False
2410 3 NaN
2411 dtype: object
2412
2413 >>> s.str.endswith(('t', 'T'))
2414 0 True
2415 1 False
2416 2 True
2417 3 NaN
2418 dtype: object
2419
2420 Specifying `na` to be `False` instead of `NaN`.
2421
2422 >>> s.str.endswith('t', na=False)
2423 0 True
2424 1 False
2425 2 False
2426 3 False
2427 dtype: bool
2428 """
2429 if not isinstance(pat, (str, tuple)):
2430 msg = f"expected a string or tuple, not {type(pat).__name__}"
2431 raise TypeError(msg)
2432 result = self._data.array._str_endswith(pat, na=na)
2433 return self._wrap_result(result, returns_string=False)
2434
2435 @forbid_nonstring_types(["bytes"])
2436 def findall(self, pat, flags: int = 0):
2437 """
2438 Find all occurrences of pattern or regular expression in the Series/Index.
2439
2440 Equivalent to applying :func:`re.findall` to all the elements in the
2441 Series/Index.
2442
2443 Parameters
2444 ----------
2445 pat : str
2446 Pattern or regular expression.
2447 flags : int, default 0
2448 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which
2449 means no flags).
2450
2451 Returns
2452 -------
2453 Series/Index of lists of strings
2454 All non-overlapping matches of pattern or regular expression in each
2455 string of this Series/Index.
2456
2457 See Also
2458 --------
2459 count : Count occurrences of pattern or regular expression in each string
2460 of the Series/Index.
2461 extractall : For each string in the Series, extract groups from all matches
2462 of regular expression and return a DataFrame with one row for each
2463 match and one column for each group.
2464 re.findall : The equivalent ``re`` function to all non-overlapping matches
2465 of pattern or regular expression in string, as a list of strings.
2466
2467 Examples
2468 --------
2469 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit'])
2470
2471 The search for the pattern 'Monkey' returns one match:
2472
2473 >>> s.str.findall('Monkey')
2474 0 []
2475 1 [Monkey]
2476 2 []
2477 dtype: object
2478
2479 On the other hand, the search for the pattern 'MONKEY' doesn't return any
2480 match:
2481
2482 >>> s.str.findall('MONKEY')
2483 0 []
2484 1 []
2485 2 []
2486 dtype: object
2487
2488 Flags can be added to the pattern or regular expression. For instance,
2489 to find the pattern 'MONKEY' ignoring the case:
2490
2491 >>> import re
2492 >>> s.str.findall('MONKEY', flags=re.IGNORECASE)
2493 0 []
2494 1 [Monkey]
2495 2 []
2496 dtype: object
2497
2498 When the pattern matches more than one string in the Series, all matches
2499 are returned:
2500
2501 >>> s.str.findall('on')
2502 0 [on]
2503 1 [on]
2504 2 []
2505 dtype: object
2506
2507 Regular expressions are supported too. For instance, the search for all the
2508 strings ending with the word 'on' is shown next:
2509
2510 >>> s.str.findall('on$')
2511 0 [on]
2512 1 []
2513 2 []
2514 dtype: object
2515
2516 If the pattern is found more than once in the same string, then a list of
2517 multiple strings is returned:
2518
2519 >>> s.str.findall('b')
2520 0 []
2521 1 []
2522 2 [b, b]
2523 dtype: object
2524 """
2525 result = self._data.array._str_findall(pat, flags)
2526 return self._wrap_result(result, returns_string=False)
2527
2528 @forbid_nonstring_types(["bytes"])
2529 def extract(
2530 self, pat: str, flags: int = 0, expand: bool = True
2531 ) -> DataFrame | Series | Index:
2532 r"""
2533 Extract capture groups in the regex `pat` as columns in a DataFrame.
2534
2535 For each subject string in the Series, extract groups from the
2536 first match of regular expression `pat`.
2537
2538 Parameters
2539 ----------
2540 pat : str
2541 Regular expression pattern with capturing groups.
2542 flags : int, default 0 (no flags)
2543 Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
2544 modify regular expression matching for things like case,
2545 spaces, etc. For more details, see :mod:`re`.
2546 expand : bool, default True
2547 If True, return DataFrame with one column per capture group.
2548 If False, return a Series/Index if there is one capture group
2549 or DataFrame if there are multiple capture groups.
2550
2551 Returns
2552 -------
2553 DataFrame or Series or Index
2554 A DataFrame with one row for each subject string, and one
2555 column for each group. Any capture group names in regular
2556 expression pat will be used for column names; otherwise
2557 capture group numbers will be used. The dtype of each result
2558 column is always object, even when no match is found. If
2559 ``expand=False`` and pat has only one capture group, then
2560 return a Series (if subject is a Series) or Index (if subject
2561 is an Index).
2562
2563 See Also
2564 --------
2565 extractall : Returns all matches (not just the first match).
2566
2567 Examples
2568 --------
2569 A pattern with two groups will return a DataFrame with two columns.
2570 Non-matches will be NaN.
2571
2572 >>> s = pd.Series(['a1', 'b2', 'c3'])
2573 >>> s.str.extract(r'([ab])(\d)')
2574 0 1
2575 0 a 1
2576 1 b 2
2577 2 NaN NaN
2578
2579 A pattern may contain optional groups.
2580
2581 >>> s.str.extract(r'([ab])?(\d)')
2582 0 1
2583 0 a 1
2584 1 b 2
2585 2 NaN 3
2586
2587 Named groups will become column names in the result.
2588
2589 >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
2590 letter digit
2591 0 a 1
2592 1 b 2
2593 2 NaN NaN
2594
2595 A pattern with one group will return a DataFrame with one column
2596 if expand=True.
2597
2598 >>> s.str.extract(r'[ab](\d)', expand=True)
2599 0
2600 0 1
2601 1 2
2602 2 NaN
2603
2604 A pattern with one group will return a Series if expand=False.
2605
2606 >>> s.str.extract(r'[ab](\d)', expand=False)
2607 0 1
2608 1 2
2609 2 NaN
2610 dtype: object
2611 """
2612 from pandas import DataFrame
2613
2614 if not isinstance(expand, bool):
2615 raise ValueError("expand must be True or False")
2616
2617 regex = re.compile(pat, flags=flags)
2618 if regex.groups == 0:
2619 raise ValueError("pattern contains no capture groups")
2620
2621 if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
2622 raise ValueError("only one regex group is supported with Index")
2623
2624 obj = self._data
2625 result_dtype = _result_dtype(obj)
2626
2627 returns_df = regex.groups > 1 or expand
2628
2629 if returns_df:
2630 name = None
2631 columns = _get_group_names(regex)
2632
2633 if obj.array.size == 0:
2634 result = DataFrame(columns=columns, dtype=result_dtype)
2635
2636 else:
2637 result_list = self._data.array._str_extract(
2638 pat, flags=flags, expand=returns_df
2639 )
2640
2641 result_index: Index | None
2642 if isinstance(obj, ABCSeries):
2643 result_index = obj.index
2644 else:
2645 result_index = None
2646
2647 result = DataFrame(
2648 result_list, columns=columns, index=result_index, dtype=result_dtype
2649 )
2650
2651 else:
2652 name = _get_single_group_name(regex)
2653 result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
2654 return self._wrap_result(result, name=name)
2655
2656 @forbid_nonstring_types(["bytes"])
2657 def extractall(self, pat, flags: int = 0):
2658 r"""
2659 Extract capture groups in the regex `pat` as columns in DataFrame.
2660
2661 For each subject string in the Series, extract groups from all
2662 matches of regular expression pat. When each subject string in the
2663 Series has exactly one match, extractall(pat).xs(0, level='match')
2664 is the same as extract(pat).
2665
2666 Parameters
2667 ----------
2668 pat : str
2669 Regular expression pattern with capturing groups.
2670 flags : int, default 0 (no flags)
2671 A ``re`` module flag, for example ``re.IGNORECASE``. These allow
2672 to modify regular expression matching for things like case, spaces,
2673 etc. Multiple flags can be combined with the bitwise OR operator,
2674 for example ``re.IGNORECASE | re.MULTILINE``.
2675
2676 Returns
2677 -------
2678 DataFrame
2679 A ``DataFrame`` with one row for each match, and one column for each
2680 group. Its rows have a ``MultiIndex`` with first levels that come from
2681 the subject ``Series``. The last level is named 'match' and indexes the
2682 matches in each item of the ``Series``. Any capture group names in
2683 regular expression pat will be used for column names; otherwise capture
2684 group numbers will be used.
2685
2686 See Also
2687 --------
2688 extract : Returns first match only (not all matches).
2689
2690 Examples
2691 --------
2692 A pattern with one group will return a DataFrame with one column.
2693 Indices with no matches will not appear in the result.
2694
2695 >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
2696 >>> s.str.extractall(r"[ab](\d)")
2697 0
2698 match
2699 A 0 1
2700 1 2
2701 B 0 1
2702
2703 Capture group names are used for column names of the result.
2704
2705 >>> s.str.extractall(r"[ab](?P<digit>\d)")
2706 digit
2707 match
2708 A 0 1
2709 1 2
2710 B 0 1
2711
2712 A pattern with two groups will return a DataFrame with two columns.
2713
2714 >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
2715 letter digit
2716 match
2717 A 0 a 1
2718 1 a 2
2719 B 0 b 1
2720
2721 Optional groups that do not match are NaN in the result.
2722
2723 >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
2724 letter digit
2725 match
2726 A 0 a 1
2727 1 a 2
2728 B 0 b 1
2729 C 0 NaN 1
2730 """
2731 # TODO: dispatch
2732 return str_extractall(self._orig, pat, flags)
2733
    # Template docstring shared by ``find`` and ``rfind``; the %(side)s,
    # %(method)s and %(also)s placeholders are substituted via ``Appender``.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """
2760
2761 @Appender(
2762 _shared_docs["find"]
2763 % {
2764 "side": "lowest",
2765 "method": "find",
2766 "also": "rfind : Return highest indexes in each strings.",
2767 }
2768 )
2769 @forbid_nonstring_types(["bytes"])
2770 def find(self, sub, start: int = 0, end=None):
2771 if not isinstance(sub, str):
2772 msg = f"expected a string object, not {type(sub).__name__}"
2773 raise TypeError(msg)
2774
2775 result = self._data.array._str_find(sub, start, end)
2776 return self._wrap_result(result, returns_string=False)
2777
2778 @Appender(
2779 _shared_docs["find"]
2780 % {
2781 "side": "highest",
2782 "method": "rfind",
2783 "also": "find : Return lowest indexes in each strings.",
2784 }
2785 )
2786 @forbid_nonstring_types(["bytes"])
2787 def rfind(self, sub, start: int = 0, end=None):
2788 if not isinstance(sub, str):
2789 msg = f"expected a string object, not {type(sub).__name__}"
2790 raise TypeError(msg)
2791
2792 result = self._data.array._str_rfind(sub, start=start, end=end)
2793 return self._wrap_result(result, returns_string=False)
2794
2795 @forbid_nonstring_types(["bytes"])
2796 def normalize(self, form):
2797 """
2798 Return the Unicode normal form for the strings in the Series/Index.
2799
2800 For more information on the forms, see the
2801 :func:`unicodedata.normalize`.
2802
2803 Parameters
2804 ----------
2805 form : {'NFC', 'NFKC', 'NFD', 'NFKD'}
2806 Unicode form.
2807
2808 Returns
2809 -------
2810 Series/Index of objects
2811 """
2812 result = self._data.array._str_normalize(form)
2813 return self._wrap_result(result)
2814
    # Template docstring shared by ``index`` and ``rindex``; placeholders
    # are substituted via ``Appender`` on the methods below.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """
2843
2844 @Appender(
2845 _shared_docs["index"]
2846 % {
2847 "side": "lowest",
2848 "similar": "find",
2849 "method": "index",
2850 "also": "rindex : Return highest indexes in each strings.",
2851 }
2852 )
2853 @forbid_nonstring_types(["bytes"])
2854 def index(self, sub, start: int = 0, end=None):
2855 if not isinstance(sub, str):
2856 msg = f"expected a string object, not {type(sub).__name__}"
2857 raise TypeError(msg)
2858
2859 result = self._data.array._str_index(sub, start=start, end=end)
2860 return self._wrap_result(result, returns_string=False)
2861
2862 @Appender(
2863 _shared_docs["index"]
2864 % {
2865 "side": "highest",
2866 "similar": "rfind",
2867 "method": "rindex",
2868 "also": "index : Return lowest indexes in each strings.",
2869 }
2870 )
2871 @forbid_nonstring_types(["bytes"])
2872 def rindex(self, sub, start: int = 0, end=None):
2873 if not isinstance(sub, str):
2874 msg = f"expected a string object, not {type(sub).__name__}"
2875 raise TypeError(msg)
2876
2877 result = self._data.array._str_rindex(sub, start=start, end=end)
2878 return self._wrap_result(result, returns_string=False)
2879
2880 def len(self):
2881 """
2882 Compute the length of each element in the Series/Index.
2883
2884 The element may be a sequence (such as a string, tuple or list) or a collection
2885 (such as a dictionary).
2886
2887 Returns
2888 -------
2889 Series or Index of int
2890 A Series or Index of integer values indicating the length of each
2891 element in the Series or Index.
2892
2893 See Also
2894 --------
2895 str.len : Python built-in function returning the length of an object.
2896 Series.size : Returns the length of the Series.
2897
2898 Examples
2899 --------
2900 Returns the length (number of characters) in a string. Returns the
2901 number of entries for dictionaries, lists or tuples.
2902
2903 >>> s = pd.Series(['dog',
2904 ... '',
2905 ... 5,
2906 ... {'foo' : 'bar'},
2907 ... [2, 3, 5, 7],
2908 ... ('one', 'two', 'three')])
2909 >>> s
2910 0 dog
2911 1
2912 2 5
2913 3 {'foo': 'bar'}
2914 4 [2, 3, 5, 7]
2915 5 (one, two, three)
2916 dtype: object
2917 >>> s.str.len()
2918 0 3.0
2919 1 0.0
2920 2 NaN
2921 3 1.0
2922 4 4.0
2923 5 3.0
2924 dtype: float64
2925 """
2926 result = self._data.array._str_len()
2927 return self._wrap_result(result, returns_string=False)
2928
    # Template docstring shared by the case-conversion methods below
    # (lower/upper/title/capitalize/swapcase/casefold); placeholders are
    # filled from ``_doc_args`` via ``Appender``.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    # cases:
    #     upper, lower, title, capitalize, swapcase, casefold
    # boolean:
    #   isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "",
    }
3022
3023 @Appender(_shared_docs["casemethods"] % _doc_args["lower"])
3024 @forbid_nonstring_types(["bytes"])
3025 def lower(self):
3026 result = self._data.array._str_lower()
3027 return self._wrap_result(result)
3028
3029 @Appender(_shared_docs["casemethods"] % _doc_args["upper"])
3030 @forbid_nonstring_types(["bytes"])
3031 def upper(self):
3032 result = self._data.array._str_upper()
3033 return self._wrap_result(result)
3034
3035 @Appender(_shared_docs["casemethods"] % _doc_args["title"])
3036 @forbid_nonstring_types(["bytes"])
3037 def title(self):
3038 result = self._data.array._str_title()
3039 return self._wrap_result(result)
3040
3041 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"])
3042 @forbid_nonstring_types(["bytes"])
3043 def capitalize(self):
3044 result = self._data.array._str_capitalize()
3045 return self._wrap_result(result)
3046
3047 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"])
3048 @forbid_nonstring_types(["bytes"])
3049 def swapcase(self):
3050 result = self._data.array._str_swapcase()
3051 return self._wrap_result(result)
3052
3053 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"])
3054 @forbid_nonstring_types(["bytes"])
3055 def casefold(self):
3056 result = self._data.array._str_casefold()
3057 return self._wrap_result(result)
3058
    # Template docstring shared by the boolean character-class predicates
    # (isalnum/isalpha/.../isdecimal) generated below via ``_map_and_wrap``.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    # Placeholder substitutions for each of the generated is* predicates.
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    # Each predicate is generated by ``_map_and_wrap`` (defined elsewhere in
    # this module) with the docstring template above filled in.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )
3239
3240
def cat_safe(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If a column contains non-string (and non-missing) values, with a
        message naming the offending inferred dtype; otherwise the original
        TypeError from the concatenation is re-raised.
    """
    try:
        return cat_core(list_of_columns, sep)
    except TypeError:
        # np.sum raises TypeError when non-string values are present (wrong
        # dtype or hidden behind object dtype); identify the offending
        # column and raise with a clearer message.
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # No offending column found: the TypeError originated elsewhere
        # (e.g. a non-string ``sep``). Previously this fell through to
        # ``return result`` and crashed with UnboundLocalError, masking
        # the real error; re-raise the original exception instead.
        raise
3275
3276
def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Concatenate the given columns element-wise, interleaving ``sep``
    between them.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # Empty separator: sum the columns element-wise directly.
        stacked = np.asarray(list_of_columns, dtype=object)
        return np.sum(stacked, axis=0)
    # Place ``sep`` between consecutive columns, then sum element-wise;
    # string "+" performed by np.sum does the concatenation.
    interleaved = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)
3302
3303
3304def _result_dtype(arr):
3305 # workaround #27953
3306 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails
3307 # when the list of values is empty.
3308 from pandas.core.arrays.string_ import StringDtype
3309
3310 if isinstance(arr.dtype, StringDtype):
3311 return arr.dtype
3312 else:
3313 return object
3314
3315
3316def _get_single_group_name(regex: re.Pattern) -> Hashable:
3317 if regex.groupindex:
3318 return next(iter(regex.groupindex))
3319 else:
3320 return None
3321
3322
3323def _get_group_names(regex: re.Pattern) -> list[Hashable]:
3324 """
3325 Get named groups from compiled regex.
3326
3327 Unnamed groups are numbered.
3328
3329 Parameters
3330 ----------
3331 regex : compiled regex
3332
3333 Returns
3334 -------
3335 list of column labels
3336 """
3337 names = {v: k for k, v in regex.groupindex.items()}
3338 return [names.get(1 + i, i) for i in range(regex.groups)]
3339
3340
def str_extractall(arr, pat, flags: int = 0):
    """
    Extract capture groups in ``pat`` from all matches in each string.

    Backing implementation for :meth:`StringMethods.extractall`.

    Parameters
    ----------
    arr : Series or Index
        Subject strings. An Index is converted to a Series with a default
        RangeIndex.
    pat : str
        Regular expression pattern; must contain at least one capture group.
    flags : int, default 0
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``.

    Returns
    -------
    DataFrame
        One row per match, indexed by a MultiIndex whose last level
        ("match") numbers the matches within each subject string.

    Raises
    ------
    ValueError
        If ``pat`` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True)

    columns = _get_group_names(regex)
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        # Non-string entries (e.g. NaN) contribute no rows.
        if isinstance(subject, str):
            if not is_mi:
                # Normalize scalar keys to 1-tuples so the concatenation
                # with the match number below is uniform.
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, str):
                    # Single-group patterns yield bare strings from findall.
                    match_tuple = (match_tuple,)
                # Empty-string group matches are treated as missing.
                # Use np.nan (np.NaN alias was removed in NumPy 2.0).
                na_tuple = [np.nan if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                index_list.append(subject_key + (match_i,))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result