Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/strings/accessor.py: 33%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

604 statements  

1from __future__ import annotations 

2 

3import codecs 

4from functools import wraps 

5import re 

6from typing import ( 

7 TYPE_CHECKING, 

8 Callable, 

9 Literal, 

10 cast, 

11) 

12import warnings 

13 

14import numpy as np 

15 

16from pandas._libs import lib 

17from pandas._typing import ( 

18 AlignJoin, 

19 DtypeObj, 

20 F, 

21 Scalar, 

22 npt, 

23) 

24from pandas.util._decorators import Appender 

25from pandas.util._exceptions import find_stack_level 

26 

27from pandas.core.dtypes.common import ( 

28 ensure_object, 

29 is_bool_dtype, 

30 is_integer, 

31 is_list_like, 

32 is_object_dtype, 

33 is_re, 

34) 

35from pandas.core.dtypes.dtypes import ( 

36 ArrowDtype, 

37 CategoricalDtype, 

38) 

39from pandas.core.dtypes.generic import ( 

40 ABCDataFrame, 

41 ABCIndex, 

42 ABCMultiIndex, 

43 ABCSeries, 

44) 

45from pandas.core.dtypes.missing import isna 

46 

47from pandas.core.arrays import ExtensionArray 

48from pandas.core.base import NoNewAttributesMixin 

49from pandas.core.construction import extract_array 

50 

51if TYPE_CHECKING: 

52 from collections.abc import ( 

53 Hashable, 

54 Iterator, 

55 ) 

56 

57 from pandas import ( 

58 DataFrame, 

59 Index, 

60 Series, 

61 ) 

62 

# Shared docstring templates; individual methods fill the %(...)s placeholders
# via the Appender decorator.
_shared_docs: dict[str, str] = {}

# Encoding names for which str.encode / bytes.decode are dispatched directly
# element-wise (per the name, these are the codecs CPython special-cases).
_cpython_optimized_encoders = (
    "utf-8",
    "utf8",
    "latin-1",
    "latin1",
    "iso-8859-1",
    "mbcs",
    "ascii",
)
# Decoding additionally covers the UTF-16/UTF-32 family.
_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32")

74 

75 

def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Decorator factory forbidding specific inferred dtypes for a StringMethods
    method.

    :meth:`StringMethods.__init__` accepts the *union* of inferred types its
    methods allow (['string', 'empty', 'bytes', 'mixed', 'mixed-integer']);
    each method uses this decorator to reject the subset it does not support.
    The check happens at call time against ``self._inferred_dtype``.

    Parameters
    ----------
    forbidden : list-of-str or None
        Forbidden inferred types, drawn from
        ``['bytes', 'mixed', 'mixed-integer']``. None means nothing forbidden.
    name : str, default None
        Name used in the error message; defaults to the wrapped function's
        ``__name__`` (needed when further wrappers obscure the real name).

    Returns
    -------
    func : wrapper
        Decorator that adds the dtype check to a method.

    Raises
    ------
    TypeError
        If the inferred dtype of the underlying data is in `forbidden`.
    """
    # `forbidden or []` also maps None to the empty list.
    allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set(
        forbidden or []
    )

    def _forbid_nonstring_types(func: F) -> F:
        func_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if self._inferred_dtype not in allowed_types:
                raise TypeError(
                    f"Cannot use .str.{func_name} with values of "
                    f"inferred dtype '{self._inferred_dtype}'."
                )
            return func(self, *args, **kwargs)

        # expose the (possibly overridden) name on the wrapper
        wrapper.__name__ = func_name
        return cast(F, wrapper)

    return _forbid_nonstring_types

143 

144 

def _map_and_wrap(name: str | None, docstring: str | None):
    """
    Build a StringMethods method that simply dispatches to the backing
    array's ``_str_<name>`` implementation and wraps the result.

    ``isnumeric``/``isdecimal`` return boolean masks, so those two are wrapped
    with ``returns_string=False``; the supplied *docstring* is attached to the
    generated method.
    """

    @forbid_nonstring_types(["bytes"], name=name)
    def wrapper(self):
        array_method = getattr(self._data.array, f"_str_{name}")
        return self._wrap_result(
            array_method(),
            returns_string=name not in ("isnumeric", "isdecimal"),
        )

    wrapper.__doc__ = docstring
    return wrapper

155 

156 

157class StringMethods(NoNewAttributesMixin): 

158 """ 

159 Vectorized string functions for Series and Index. 

160 

161 NAs stay NA unless handled otherwise by a particular method. 

162 Patterned after Python's string methods, with some inspiration from 

163 R's stringr package. 

164 

165 Examples 

166 -------- 

167 >>> s = pd.Series(["A_Str_Series"]) 

168 >>> s 

169 0 A_Str_Series 

170 dtype: object 

171 

172 >>> s.str.split("_") 

173 0 [A, Str, Series] 

174 dtype: object 

175 

176 >>> s.str.replace("_", "") 

177 0 AStrSeries 

178 dtype: object 

179 """ 

180 

181 # Note: see the docstring in pandas.core.strings.__init__ 

182 # for an explanation of the implementation. 

183 # TODO: Dispatch all the methods 

184 # Currently the following are not dispatched to the array 

185 # * cat 

186 # * extractall 

187 

188 def __init__(self, data) -> None: 

189 from pandas.core.arrays.string_ import StringDtype 

190 

191 self._inferred_dtype = self._validate(data) 

192 self._is_categorical = isinstance(data.dtype, CategoricalDtype) 

193 self._is_string = isinstance(data.dtype, StringDtype) 

194 self._data = data 

195 

196 self._index = self._name = None 

197 if isinstance(data, ABCSeries): 

198 self._index = data.index 

199 self._name = data.name 

200 

201 # ._values.categories works for both Series/Index 

202 self._parent = data._values.categories if self._is_categorical else data 

203 # save orig to blow up categoricals to the right type 

204 self._orig = data 

205 self._freeze() 

206 

207 @staticmethod 

208 def _validate(data): 

209 """ 

210 Auxiliary function for StringMethods, infers and checks dtype of data. 

211 

212 This is a "first line of defence" at the creation of the StringMethods- 

213 object, and just checks that the dtype is in the 

214 *union* of the allowed types over all string methods below; this 

215 restriction is then refined on a per-method basis using the decorator 

216 @forbid_nonstring_types (more info in the corresponding docstring). 

217 

218 This really should exclude all series/index with any non-string values, 

219 but that isn't practical for performance reasons until we have a str 

220 dtype (GH 9343 / 13877) 

221 

222 Parameters 

223 ---------- 

224 data : The content of the Series 

225 

226 Returns 

227 ------- 

228 dtype : inferred dtype of data 

229 """ 

230 if isinstance(data, ABCMultiIndex): 

231 raise AttributeError( 

232 "Can only use .str accessor with Index, not MultiIndex" 

233 ) 

234 

235 # see _libs/lib.pyx for list of inferred types 

236 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] 

237 

238 data = extract_array(data) 

239 

240 values = getattr(data, "categories", data) # categorical / normal 

241 

242 inferred_dtype = lib.infer_dtype(values, skipna=True) 

243 

244 if inferred_dtype not in allowed_types: 

245 raise AttributeError("Can only use .str accessor with string values!") 

246 return inferred_dtype 

247 

248 def __getitem__(self, key): 

249 result = self._data.array._str_getitem(key) 

250 return self._wrap_result(result) 

251 

252 def __iter__(self) -> Iterator: 

253 raise TypeError(f"'{type(self).__name__}' object is not iterable") 

254 

255 def _wrap_result( 

256 self, 

257 result, 

258 name=None, 

259 expand: bool | None = None, 

260 fill_value=np.nan, 

261 returns_string: bool = True, 

262 returns_bool: bool = False, 

263 dtype=None, 

264 ): 

265 from pandas import ( 

266 Index, 

267 MultiIndex, 

268 ) 

269 

270 if not hasattr(result, "ndim") or not hasattr(result, "dtype"): 

271 if isinstance(result, ABCDataFrame): 

272 result = result.__finalize__(self._orig, name="str") 

273 return result 

274 assert result.ndim < 3 

275 

276 # We can be wrapping a string / object / categorical result, in which 

277 # case we'll want to return the same dtype as the input. 

278 # Or we can be wrapping a numeric output, in which case we don't want 

279 # to return a StringArray. 

280 # Ideally the array method returns the right array type. 

281 if expand is None: 

282 # infer from ndim if expand is not specified 

283 expand = result.ndim != 1 

284 elif expand is True and not isinstance(self._orig, ABCIndex): 

285 # required when expand=True is explicitly specified 

286 # not needed when inferred 

287 if isinstance(result.dtype, ArrowDtype): 

288 import pyarrow as pa 

289 

290 from pandas.compat import pa_version_under11p0 

291 

292 from pandas.core.arrays.arrow.array import ArrowExtensionArray 

293 

294 value_lengths = pa.compute.list_value_length(result._pa_array) 

295 max_len = pa.compute.max(value_lengths).as_py() 

296 min_len = pa.compute.min(value_lengths).as_py() 

297 if result._hasna: 

298 # ArrowExtensionArray.fillna doesn't work for list scalars 

299 result = ArrowExtensionArray( 

300 result._pa_array.fill_null([None] * max_len) 

301 ) 

302 if min_len < max_len: 

303 # append nulls to each scalar list element up to max_len 

304 if not pa_version_under11p0: 

305 result = ArrowExtensionArray( 

306 pa.compute.list_slice( 

307 result._pa_array, 

308 start=0, 

309 stop=max_len, 

310 return_fixed_size_list=True, 

311 ) 

312 ) 

313 else: 

314 all_null = np.full(max_len, fill_value=None, dtype=object) 

315 values = result.to_numpy() 

316 new_values = [] 

317 for row in values: 

318 if len(row) < max_len: 

319 nulls = all_null[: max_len - len(row)] 

320 row = np.append(row, nulls) 

321 new_values.append(row) 

322 pa_type = result._pa_array.type 

323 result = ArrowExtensionArray(pa.array(new_values, type=pa_type)) 

324 if name is not None: 

325 labels = name 

326 else: 

327 labels = range(max_len) 

328 result = ( 

329 pa.compute.list_flatten(result._pa_array) 

330 .to_numpy() 

331 .reshape(len(result), max_len) 

332 ) 

333 result = { 

334 label: ArrowExtensionArray(pa.array(res)) 

335 for label, res in zip(labels, result.T) 

336 } 

337 elif is_object_dtype(result): 

338 

339 def cons_row(x): 

340 if is_list_like(x): 

341 return x 

342 else: 

343 return [x] 

344 

345 result = [cons_row(x) for x in result] 

346 if result and not self._is_string: 

347 # propagate nan values to match longest sequence (GH 18450) 

348 max_len = max(len(x) for x in result) 

349 result = [ 

350 x * max_len if len(x) == 0 or x[0] is np.nan else x 

351 for x in result 

352 ] 

353 

354 if not isinstance(expand, bool): 

355 raise ValueError("expand must be True or False") 

356 

357 if expand is False: 

358 # if expand is False, result should have the same name 

359 # as the original otherwise specified 

360 if name is None: 

361 name = getattr(result, "name", None) 

362 if name is None: 

363 # do not use logical or, _orig may be a DataFrame 

364 # which has "name" column 

365 name = self._orig.name 

366 

367 # Wait until we are sure result is a Series or Index before 

368 # checking attributes (GH 12180) 

369 if isinstance(self._orig, ABCIndex): 

370 # if result is a boolean np.array, return the np.array 

371 # instead of wrapping it into a boolean Index (GH 8875) 

372 if is_bool_dtype(result): 

373 return result 

374 

375 if expand: 

376 result = list(result) 

377 out: Index = MultiIndex.from_tuples(result, names=name) 

378 if out.nlevels == 1: 

379 # We had all tuples of length-one, which are 

380 # better represented as a regular Index. 

381 out = out.get_level_values(0) 

382 return out 

383 else: 

384 return Index(result, name=name, dtype=dtype) 

385 else: 

386 index = self._orig.index 

387 # This is a mess. 

388 _dtype: DtypeObj | str | None = dtype 

389 vdtype = getattr(result, "dtype", None) 

390 if self._is_string: 

391 if is_bool_dtype(vdtype): 

392 _dtype = result.dtype 

393 elif returns_string: 

394 _dtype = self._orig.dtype 

395 else: 

396 _dtype = vdtype 

397 elif vdtype is not None: 

398 _dtype = vdtype 

399 

400 if expand: 

401 cons = self._orig._constructor_expanddim 

402 result = cons(result, columns=name, index=index, dtype=_dtype) 

403 else: 

404 # Must be a Series 

405 cons = self._orig._constructor 

406 result = cons(result, name=name, index=index, dtype=_dtype) 

407 result = result.__finalize__(self._orig, method="str") 

408 if name is not None and result.ndim == 1: 

409 # __finalize__ might copy over the original name, but we may 

410 # want the new name (e.g. str.extract). 

411 result.name = name 

412 return result 

413 

414 def _get_series_list(self, others): 

415 """ 

416 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input 

417 into a list of Series (elements without an index must match the length 

418 of the calling Series/Index). 

419 

420 Parameters 

421 ---------- 

422 others : Series, DataFrame, np.ndarray, list-like or list-like of 

423 Objects that are either Series, Index or np.ndarray (1-dim). 

424 

425 Returns 

426 ------- 

427 list of Series 

428 Others transformed into list of Series. 

429 """ 

430 from pandas import ( 

431 DataFrame, 

432 Series, 

433 ) 

434 

435 # self._orig is either Series or Index 

436 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index 

437 

438 # Generally speaking, all objects without an index inherit the index 

439 # `idx` of the calling Series/Index - i.e. must have matching length. 

440 # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 

441 if isinstance(others, ABCSeries): 

442 return [others] 

443 elif isinstance(others, ABCIndex): 

444 return [Series(others, index=idx, dtype=others.dtype)] 

445 elif isinstance(others, ABCDataFrame): 

446 return [others[x] for x in others] 

447 elif isinstance(others, np.ndarray) and others.ndim == 2: 

448 others = DataFrame(others, index=idx) 

449 return [others[x] for x in others] 

450 elif is_list_like(others, allow_sets=False): 

451 try: 

452 others = list(others) # ensure iterators do not get read twice etc 

453 except TypeError: 

454 # e.g. ser.str, raise below 

455 pass 

456 else: 

457 # in case of list-like `others`, all elements must be 

458 # either Series/Index/np.ndarray (1-dim)... 

459 if all( 

460 isinstance(x, (ABCSeries, ABCIndex, ExtensionArray)) 

461 or (isinstance(x, np.ndarray) and x.ndim == 1) 

462 for x in others 

463 ): 

464 los: list[Series] = [] 

465 while others: # iterate through list and append each element 

466 los = los + self._get_series_list(others.pop(0)) 

467 return los 

468 # ... or just strings 

469 elif all(not is_list_like(x) for x in others): 

470 return [Series(others, index=idx)] 

471 raise TypeError( 

472 "others must be Series, Index, DataFrame, np.ndarray " 

473 "or list-like (either containing only strings or " 

474 "containing only objects of type Series/Index/" 

475 "np.ndarray[1-dim])" 

476 ) 

477 

478 @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) 

479 def cat( 

480 self, 

481 others=None, 

482 sep: str | None = None, 

483 na_rep=None, 

484 join: AlignJoin = "left", 

485 ) -> str | Series | Index: 

486 """ 

487 Concatenate strings in the Series/Index with given separator. 

488 

489 If `others` is specified, this function concatenates the Series/Index 

490 and elements of `others` element-wise. 

491 If `others` is not passed, then all values in the Series/Index are 

492 concatenated into a single string with a given `sep`. 

493 

494 Parameters 

495 ---------- 

496 others : Series, Index, DataFrame, np.ndarray or list-like 

497 Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and 

498 other list-likes of strings must have the same length as the 

499 calling Series/Index, with the exception of indexed objects (i.e. 

500 Series/Index/DataFrame) if `join` is not None. 

501 

502 If others is a list-like that contains a combination of Series, 

503 Index or np.ndarray (1-dim), then all elements will be unpacked and 

504 must satisfy the above criteria individually. 

505 

506 If others is None, the method returns the concatenation of all 

507 strings in the calling Series/Index. 

508 sep : str, default '' 

509 The separator between the different elements/columns. By default 

510 the empty string `''` is used. 

511 na_rep : str or None, default None 

512 Representation that is inserted for all missing values: 

513 

514 - If `na_rep` is None, and `others` is None, missing values in the 

515 Series/Index are omitted from the result. 

516 - If `na_rep` is None, and `others` is not None, a row containing a 

517 missing value in any of the columns (before concatenation) will 

518 have a missing value in the result. 

519 join : {'left', 'right', 'outer', 'inner'}, default 'left' 

520 Determines the join-style between the calling Series/Index and any 

521 Series/Index/DataFrame in `others` (objects without an index need 

522 to match the length of the calling Series/Index). To disable 

523 alignment, use `.values` on any Series/Index/DataFrame in `others`. 

524 

525 Returns 

526 ------- 

527 str, Series or Index 

528 If `others` is None, `str` is returned, otherwise a `Series/Index` 

529 (same type as caller) of objects is returned. 

530 

531 See Also 

532 -------- 

533 split : Split each string in the Series/Index. 

534 join : Join lists contained as elements in the Series/Index. 

535 

536 Examples 

537 -------- 

538 When not passing `others`, all values are concatenated into a single 

539 string: 

540 

541 >>> s = pd.Series(['a', 'b', np.nan, 'd']) 

542 >>> s.str.cat(sep=' ') 

543 'a b d' 

544 

545 By default, NA values in the Series are ignored. Using `na_rep`, they 

546 can be given a representation: 

547 

548 >>> s.str.cat(sep=' ', na_rep='?') 

549 'a b ? d' 

550 

551 If `others` is specified, corresponding values are concatenated with 

552 the separator. Result will be a Series of strings. 

553 

554 >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') 

555 0 a,A 

556 1 b,B 

557 2 NaN 

558 3 d,D 

559 dtype: object 

560 

561 Missing values will remain missing in the result, but can again be 

562 represented using `na_rep` 

563 

564 >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') 

565 0 a,A 

566 1 b,B 

567 2 -,C 

568 3 d,D 

569 dtype: object 

570 

571 If `sep` is not specified, the values are concatenated without 

572 separation. 

573 

574 >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') 

575 0 aA 

576 1 bB 

577 2 -C 

578 3 dD 

579 dtype: object 

580 

581 Series with different indexes can be aligned before concatenation. The 

582 `join`-keyword works as in other methods. 

583 

584 >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) 

585 >>> s.str.cat(t, join='left', na_rep='-') 

586 0 aa 

587 1 b- 

588 2 -c 

589 3 dd 

590 dtype: object 

591 >>> 

592 >>> s.str.cat(t, join='outer', na_rep='-') 

593 0 aa 

594 1 b- 

595 2 -c 

596 3 dd 

597 4 -e 

598 dtype: object 

599 >>> 

600 >>> s.str.cat(t, join='inner', na_rep='-') 

601 0 aa 

602 2 -c 

603 3 dd 

604 dtype: object 

605 >>> 

606 >>> s.str.cat(t, join='right', na_rep='-') 

607 3 dd 

608 0 aa 

609 4 -e 

610 2 -c 

611 dtype: object 

612 

613 For more examples, see :ref:`here <text.concatenate>`. 

614 """ 

615 # TODO: dispatch 

616 from pandas import ( 

617 Index, 

618 Series, 

619 concat, 

620 ) 

621 

622 if isinstance(others, str): 

623 raise ValueError("Did you mean to supply a `sep` keyword?") 

624 if sep is None: 

625 sep = "" 

626 

627 if isinstance(self._orig, ABCIndex): 

628 data = Series(self._orig, index=self._orig, dtype=self._orig.dtype) 

629 else: # Series 

630 data = self._orig 

631 

632 # concatenate Series/Index with itself if no "others" 

633 if others is None: 

634 # error: Incompatible types in assignment (expression has type 

635 # "ndarray", variable has type "Series") 

636 data = ensure_object(data) # type: ignore[assignment] 

637 na_mask = isna(data) 

638 if na_rep is None and na_mask.any(): 

639 return sep.join(data[~na_mask]) 

640 elif na_rep is not None and na_mask.any(): 

641 return sep.join(np.where(na_mask, na_rep, data)) 

642 else: 

643 return sep.join(data) 

644 

645 try: 

646 # turn anything in "others" into lists of Series 

647 others = self._get_series_list(others) 

648 except ValueError as err: # do not catch TypeError raised by _get_series_list 

649 raise ValueError( 

650 "If `others` contains arrays or lists (or other " 

651 "list-likes without an index), these must all be " 

652 "of the same length as the calling Series/Index." 

653 ) from err 

654 

655 # align if required 

656 if any(not data.index.equals(x.index) for x in others): 

657 # Need to add keys for uniqueness in case of duplicate columns 

658 others = concat( 

659 others, 

660 axis=1, 

661 join=(join if join == "inner" else "outer"), 

662 keys=range(len(others)), 

663 sort=False, 

664 copy=False, 

665 ) 

666 data, others = data.align(others, join=join) 

667 others = [others[x] for x in others] # again list of Series 

668 

669 all_cols = [ensure_object(x) for x in [data] + others] 

670 na_masks = np.array([isna(x) for x in all_cols]) 

671 union_mask = np.logical_or.reduce(na_masks, axis=0) 

672 

673 if na_rep is None and union_mask.any(): 

674 # no na_rep means NaNs for all rows where any column has a NaN 

675 # only necessary if there are actually any NaNs 

676 result = np.empty(len(data), dtype=object) 

677 np.putmask(result, union_mask, np.nan) 

678 

679 not_masked = ~union_mask 

680 result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) 

681 elif na_rep is not None and union_mask.any(): 

682 # fill NaNs with na_rep in case there are actually any NaNs 

683 all_cols = [ 

684 np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) 

685 ] 

686 result = cat_safe(all_cols, sep) 

687 else: 

688 # no NaNs - can just concatenate 

689 result = cat_safe(all_cols, sep) 

690 

691 out: Index | Series 

692 if isinstance(self._orig.dtype, CategoricalDtype): 

693 # We need to infer the new categories. 

694 dtype = self._orig.dtype.categories.dtype 

695 else: 

696 dtype = self._orig.dtype 

697 if isinstance(self._orig, ABCIndex): 

698 # add dtype for case that result is all-NA 

699 if isna(result).all(): 

700 dtype = object # type: ignore[assignment] 

701 

702 out = Index(result, dtype=dtype, name=self._orig.name) 

703 else: # Series 

704 res_ser = Series( 

705 result, dtype=dtype, index=data.index, name=self._orig.name, copy=False 

706 ) 

707 out = res_ser.__finalize__(self._orig, method="str_cat") 

708 return out 

709 

710 _shared_docs[ 

711 "str_split" 

712 ] = r""" 

713 Split strings around given separator/delimiter. 

714 

715 Splits the string in the Series/Index from the %(side)s, 

716 at the specified delimiter string. 

717 

718 Parameters 

719 ---------- 

720 pat : str%(pat_regex)s, optional 

721 %(pat_description)s. 

722 If not specified, split on whitespace. 

723 n : int, default -1 (all) 

724 Limit number of splits in output. 

725 ``None``, 0 and -1 will be interpreted as return all splits. 

726 expand : bool, default False 

727 Expand the split strings into separate columns. 

728 

729 - If ``True``, return DataFrame/MultiIndex expanding dimensionality. 

730 - If ``False``, return Series/Index, containing lists of strings. 

731 %(regex_argument)s 

732 Returns 

733 ------- 

734 Series, Index, DataFrame or MultiIndex 

735 Type matches caller unless ``expand=True`` (see Notes). 

736 %(raises_split)s 

737 See Also 

738 -------- 

739 Series.str.split : Split strings around given separator/delimiter. 

740 Series.str.rsplit : Splits string around given separator/delimiter, 

741 starting from the right. 

742 Series.str.join : Join lists contained as elements in the Series/Index 

743 with passed delimiter. 

744 str.split : Standard library version for split. 

745 str.rsplit : Standard library version for rsplit. 

746 

747 Notes 

748 ----- 

749 The handling of the `n` keyword depends on the number of found splits: 

750 

751 - If found splits > `n`, make first `n` splits only 

752 - If found splits <= `n`, make all splits 

753 - If for a certain row the number of found splits < `n`, 

754 append `None` for padding up to `n` if ``expand=True`` 

755 

756 If using ``expand=True``, Series and Index callers return DataFrame and 

757 MultiIndex objects, respectively. 

758 %(regex_pat_note)s 

759 Examples 

760 -------- 

761 >>> s = pd.Series( 

762 ... [ 

763 ... "this is a regular sentence", 

764 ... "https://docs.python.org/3/tutorial/index.html", 

765 ... np.nan 

766 ... ] 

767 ... ) 

768 >>> s 

769 0 this is a regular sentence 

770 1 https://docs.python.org/3/tutorial/index.html 

771 2 NaN 

772 dtype: object 

773 

774 In the default setting, the string is split by whitespace. 

775 

776 >>> s.str.split() 

777 0 [this, is, a, regular, sentence] 

778 1 [https://docs.python.org/3/tutorial/index.html] 

779 2 NaN 

780 dtype: object 

781 

782 Without the `n` parameter, the outputs of `rsplit` and `split` 

783 are identical. 

784 

785 >>> s.str.rsplit() 

786 0 [this, is, a, regular, sentence] 

787 1 [https://docs.python.org/3/tutorial/index.html] 

788 2 NaN 

789 dtype: object 

790 

791 The `n` parameter can be used to limit the number of splits on the 

792 delimiter. The outputs of `split` and `rsplit` are different. 

793 

794 >>> s.str.split(n=2) 

795 0 [this, is, a regular sentence] 

796 1 [https://docs.python.org/3/tutorial/index.html] 

797 2 NaN 

798 dtype: object 

799 

800 >>> s.str.rsplit(n=2) 

801 0 [this is a, regular, sentence] 

802 1 [https://docs.python.org/3/tutorial/index.html] 

803 2 NaN 

804 dtype: object 

805 

806 The `pat` parameter can be used to split by other characters. 

807 

808 >>> s.str.split(pat="/") 

809 0 [this is a regular sentence] 

810 1 [https:, , docs.python.org, 3, tutorial, index... 

811 2 NaN 

812 dtype: object 

813 

814 When using ``expand=True``, the split elements will expand out into 

815 separate columns. If NaN is present, it is propagated throughout 

816 the columns during the split. 

817 

818 >>> s.str.split(expand=True) 

819 0 1 2 3 4 

820 0 this is a regular sentence 

821 1 https://docs.python.org/3/tutorial/index.html None None None None 

822 2 NaN NaN NaN NaN NaN 

823 

824 For slightly more complex use cases like splitting the html document name 

825 from a url, a combination of parameter settings can be used. 

826 

827 >>> s.str.rsplit("/", n=1, expand=True) 

828 0 1 

829 0 this is a regular sentence None 

830 1 https://docs.python.org/3/tutorial index.html 

831 2 NaN NaN 

832 %(regex_examples)s""" 

833 

834 @Appender( 

835 _shared_docs["str_split"] 

836 % { 

837 "side": "beginning", 

838 "pat_regex": " or compiled regex", 

839 "pat_description": "String or regular expression to split on", 

840 "regex_argument": """ 

841 regex : bool, default None 

842 Determines if the passed-in pattern is a regular expression: 

843 

844 - If ``True``, assumes the passed-in pattern is a regular expression 

845 - If ``False``, treats the pattern as a literal string. 

846 - If ``None`` and `pat` length is 1, treats `pat` as a literal string. 

847 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. 

848 - Cannot be set to False if `pat` is a compiled regex 

849 

850 .. versionadded:: 1.4.0 

851 """, 

852 "raises_split": """ 

853 Raises 

854 ------ 

855 ValueError 

856 * if `regex` is False and `pat` is a compiled regex 

857 """, 

858 "regex_pat_note": """ 

859 Use of `regex =False` with a `pat` as a compiled regex will raise an error. 

860 """, 

861 "method": "split", 

862 "regex_examples": r""" 

863 Remember to escape special characters when explicitly using regular expressions. 

864 

865 >>> s = pd.Series(["foo and bar plus baz"]) 

866 >>> s.str.split(r"and|plus", expand=True) 

867 0 1 2 

868 0 foo bar baz 

869 

870 Regular expressions can be used to handle urls or file names. 

871 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled 

872 as a regex only if ``len(pat) != 1``. 

873 

874 >>> s = pd.Series(['foojpgbar.jpg']) 

875 >>> s.str.split(r".", expand=True) 

876 0 1 

877 0 foojpgbar jpg 

878 

879 >>> s.str.split(r"\.jpg", expand=True) 

880 0 1 

881 0 foojpgbar 

882 

883 When ``regex=True``, `pat` is interpreted as a regex 

884 

885 >>> s.str.split(r"\.jpg", regex=True, expand=True) 

886 0 1 

887 0 foojpgbar 

888 

889 A compiled regex can be passed as `pat` 

890 

891 >>> import re 

892 >>> s.str.split(re.compile(r"\.jpg"), expand=True) 

893 0 1 

894 0 foojpgbar 

895 

896 When ``regex=False``, `pat` is interpreted as the string itself 

897 

898 >>> s.str.split(r"\.jpg", regex=False, expand=True) 

899 0 

900 0 foojpgbar.jpg 

901 """, 

902 } 

903 ) 

904 @forbid_nonstring_types(["bytes"]) 

905 def split( 

906 self, 

907 pat: str | re.Pattern | None = None, 

908 *, 

909 n=-1, 

910 expand: bool = False, 

911 regex: bool | None = None, 

912 ): 

913 if regex is False and is_re(pat): 

914 raise ValueError( 

915 "Cannot use a compiled regex as replacement pattern with regex=False" 

916 ) 

917 if is_re(pat): 

918 regex = True 

919 result = self._data.array._str_split(pat, n, expand, regex) 

920 if self._data.dtype == "category": 

921 dtype = self._data.dtype.categories.dtype 

922 else: 

923 dtype = object if self._data.dtype == object else None 

924 return self._wrap_result( 

925 result, expand=expand, returns_string=expand, dtype=dtype 

926 ) 

927 

928 @Appender( 

929 _shared_docs["str_split"] 

930 % { 

931 "side": "end", 

932 "pat_regex": "", 

933 "pat_description": "String to split on", 

934 "regex_argument": "", 

935 "raises_split": "", 

936 "regex_pat_note": "", 

937 "method": "rsplit", 

938 "regex_examples": "", 

939 } 

940 ) 

941 @forbid_nonstring_types(["bytes"]) 

942 def rsplit(self, pat=None, *, n=-1, expand: bool = False): 

943 result = self._data.array._str_rsplit(pat, n=n) 

944 dtype = object if self._data.dtype == object else None 

945 return self._wrap_result( 

946 result, expand=expand, returns_string=expand, dtype=dtype 

947 ) 

948 

    # Shared docstring template for ``partition`` and ``rpartition``.
    # The %(side)s, %(return)s and %(also)s placeholders are substituted by
    # the ``@Appender`` decorator on each of the two methods below.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                   0  1            2
    0  Linda van der            Berg
    1         George     Pitt-Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """

1030 

1031 @Appender( 

1032 _shared_docs["str_partition"] 

1033 % { 

1034 "side": "first", 

1035 "return": "3 elements containing the string itself, followed by two " 

1036 "empty strings", 

1037 "also": "rpartition : Split the string at the last occurrence of `sep`.", 

1038 } 

1039 ) 

1040 @forbid_nonstring_types(["bytes"]) 

1041 def partition(self, sep: str = " ", expand: bool = True): 

1042 result = self._data.array._str_partition(sep, expand) 

1043 if self._data.dtype == "category": 

1044 dtype = self._data.dtype.categories.dtype 

1045 else: 

1046 dtype = object if self._data.dtype == object else None 

1047 return self._wrap_result( 

1048 result, expand=expand, returns_string=expand, dtype=dtype 

1049 ) 

1050 

1051 @Appender( 

1052 _shared_docs["str_partition"] 

1053 % { 

1054 "side": "last", 

1055 "return": "3 elements containing two empty strings, followed by the " 

1056 "string itself", 

1057 "also": "partition : Split the string at the first occurrence of `sep`.", 

1058 } 

1059 ) 

1060 @forbid_nonstring_types(["bytes"]) 

1061 def rpartition(self, sep: str = " ", expand: bool = True): 

1062 result = self._data.array._str_rpartition(sep, expand) 

1063 if self._data.dtype == "category": 

1064 dtype = self._data.dtype.categories.dtype 

1065 else: 

1066 dtype = object if self._data.dtype == object else None 

1067 return self._wrap_result( 

1068 result, expand=expand, returns_string=expand, dtype=dtype 

1069 ) 

1070 

1071 def get(self, i): 

1072 """ 

1073 Extract element from each component at specified position or with specified key. 

1074 

1075 Extract element from lists, tuples, dict, or strings in each element in the 

1076 Series/Index. 

1077 

1078 Parameters 

1079 ---------- 

1080 i : int or hashable dict label 

1081 Position or key of element to extract. 

1082 

1083 Returns 

1084 ------- 

1085 Series or Index 

1086 

1087 Examples 

1088 -------- 

1089 >>> s = pd.Series(["String", 

1090 ... (1, 2, 3), 

1091 ... ["a", "b", "c"], 

1092 ... 123, 

1093 ... -456, 

1094 ... {1: "Hello", "2": "World"}]) 

1095 >>> s 

1096 0 String 

1097 1 (1, 2, 3) 

1098 2 [a, b, c] 

1099 3 123 

1100 4 -456 

1101 5 {1: 'Hello', '2': 'World'} 

1102 dtype: object 

1103 

1104 >>> s.str.get(1) 

1105 0 t 

1106 1 2 

1107 2 b 

1108 3 NaN 

1109 4 NaN 

1110 5 Hello 

1111 dtype: object 

1112 

1113 >>> s.str.get(-1) 

1114 0 g 

1115 1 3 

1116 2 c 

1117 3 NaN 

1118 4 NaN 

1119 5 None 

1120 dtype: object 

1121 

1122 Return element with given key 

1123 

1124 >>> s = pd.Series([{"name": "Hello", "value": "World"}, 

1125 ... {"name": "Goodbye", "value": "Planet"}]) 

1126 >>> s.str.get('name') 

1127 0 Hello 

1128 1 Goodbye 

1129 dtype: object 

1130 """ 

1131 result = self._data.array._str_get(i) 

1132 return self._wrap_result(result) 

1133 

1134 @forbid_nonstring_types(["bytes"]) 

1135 def join(self, sep: str): 

1136 """ 

1137 Join lists contained as elements in the Series/Index with passed delimiter. 

1138 

1139 If the elements of a Series are lists themselves, join the content of these 

1140 lists using the delimiter passed to the function. 

1141 This function is an equivalent to :meth:`str.join`. 

1142 

1143 Parameters 

1144 ---------- 

1145 sep : str 

1146 Delimiter to use between list entries. 

1147 

1148 Returns 

1149 ------- 

1150 Series/Index: object 

1151 The list entries concatenated by intervening occurrences of the 

1152 delimiter. 

1153 

1154 Raises 

1155 ------ 

1156 AttributeError 

1157 If the supplied Series contains neither strings nor lists. 

1158 

1159 See Also 

1160 -------- 

1161 str.join : Standard library version of this method. 

1162 Series.str.split : Split strings around given separator/delimiter. 

1163 

1164 Notes 

1165 ----- 

1166 If any of the list items is not a string object, the result of the join 

1167 will be `NaN`. 

1168 

1169 Examples 

1170 -------- 

1171 Example with a list that contains non-string elements. 

1172 

1173 >>> s = pd.Series([['lion', 'elephant', 'zebra'], 

1174 ... [1.1, 2.2, 3.3], 

1175 ... ['cat', np.nan, 'dog'], 

1176 ... ['cow', 4.5, 'goat'], 

1177 ... ['duck', ['swan', 'fish'], 'guppy']]) 

1178 >>> s 

1179 0 [lion, elephant, zebra] 

1180 1 [1.1, 2.2, 3.3] 

1181 2 [cat, nan, dog] 

1182 3 [cow, 4.5, goat] 

1183 4 [duck, [swan, fish], guppy] 

1184 dtype: object 

1185 

1186 Join all lists using a '-'. The lists containing object(s) of types other 

1187 than str will produce a NaN. 

1188 

1189 >>> s.str.join('-') 

1190 0 lion-elephant-zebra 

1191 1 NaN 

1192 2 NaN 

1193 3 NaN 

1194 4 NaN 

1195 dtype: object 

1196 """ 

1197 result = self._data.array._str_join(sep) 

1198 return self._wrap_result(result) 

1199 

1200 @forbid_nonstring_types(["bytes"]) 

1201 def contains( 

1202 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True 

1203 ): 

1204 r""" 

1205 Test if pattern or regex is contained within a string of a Series or Index. 

1206 

1207 Return boolean Series or Index based on whether a given pattern or regex is 

1208 contained within a string of a Series or Index. 

1209 

1210 Parameters 

1211 ---------- 

1212 pat : str 

1213 Character sequence or regular expression. 

1214 case : bool, default True 

1215 If True, case sensitive. 

1216 flags : int, default 0 (no flags) 

1217 Flags to pass through to the re module, e.g. re.IGNORECASE. 

1218 na : scalar, optional 

1219 Fill value for missing values. The default depends on dtype of the 

1220 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1221 ``pandas.NA`` is used. 

1222 regex : bool, default True 

1223 If True, assumes the pat is a regular expression. 

1224 

1225 If False, treats the pat as a literal string. 

1226 

1227 Returns 

1228 ------- 

1229 Series or Index of boolean values 

1230 A Series or Index of boolean values indicating whether the 

1231 given pattern is contained within the string of each element 

1232 of the Series or Index. 

1233 

1234 See Also 

1235 -------- 

1236 match : Analogous, but stricter, relying on re.match instead of re.search. 

1237 Series.str.startswith : Test if the start of each string element matches a 

1238 pattern. 

1239 Series.str.endswith : Same as startswith, but tests the end of string. 

1240 

1241 Examples 

1242 -------- 

1243 Returning a Series of booleans using only a literal pattern. 

1244 

1245 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan]) 

1246 >>> s1.str.contains('og', regex=False) 

1247 0 False 

1248 1 True 

1249 2 False 

1250 3 False 

1251 4 NaN 

1252 dtype: object 

1253 

1254 Returning an Index of booleans using only a literal pattern. 

1255 

1256 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan]) 

1257 >>> ind.str.contains('23', regex=False) 

1258 Index([False, False, False, True, nan], dtype='object') 

1259 

1260 Specifying case sensitivity using `case`. 

1261 

1262 >>> s1.str.contains('oG', case=True, regex=True) 

1263 0 False 

1264 1 False 

1265 2 False 

1266 3 False 

1267 4 NaN 

1268 dtype: object 

1269 

1270 Specifying `na` to be `False` instead of `NaN` replaces NaN values 

1271 with `False`. If Series or Index does not contain NaN values 

1272 the resultant dtype will be `bool`, otherwise, an `object` dtype. 

1273 

1274 >>> s1.str.contains('og', na=False, regex=True) 

1275 0 False 

1276 1 True 

1277 2 False 

1278 3 False 

1279 4 False 

1280 dtype: bool 

1281 

1282 Returning 'house' or 'dog' when either expression occurs in a string. 

1283 

1284 >>> s1.str.contains('house|dog', regex=True) 

1285 0 False 

1286 1 True 

1287 2 True 

1288 3 False 

1289 4 NaN 

1290 dtype: object 

1291 

1292 Ignoring case sensitivity using `flags` with regex. 

1293 

1294 >>> import re 

1295 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) 

1296 0 False 

1297 1 False 

1298 2 True 

1299 3 False 

1300 4 NaN 

1301 dtype: object 

1302 

1303 Returning any digit using regular expression. 

1304 

1305 >>> s1.str.contains('\\d', regex=True) 

1306 0 False 

1307 1 False 

1308 2 False 

1309 3 True 

1310 4 NaN 

1311 dtype: object 

1312 

1313 Ensure `pat` is a not a literal pattern when `regex` is set to True. 

1314 Note in the following example one might expect only `s2[1]` and `s2[3]` to 

1315 return `True`. However, '.0' as a regex matches any character 

1316 followed by a 0. 

1317 

1318 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) 

1319 >>> s2.str.contains('.0', regex=True) 

1320 0 True 

1321 1 True 

1322 2 False 

1323 3 True 

1324 4 False 

1325 dtype: bool 

1326 """ 

1327 if regex and re.compile(pat).groups: 

1328 warnings.warn( 

1329 "This pattern is interpreted as a regular expression, and has " 

1330 "match groups. To actually get the groups, use str.extract.", 

1331 UserWarning, 

1332 stacklevel=find_stack_level(), 

1333 ) 

1334 

1335 result = self._data.array._str_contains(pat, case, flags, na, regex) 

1336 return self._wrap_result(result, fill_value=na, returns_string=False) 

1337 

1338 @forbid_nonstring_types(["bytes"]) 

1339 def match(self, pat: str, case: bool = True, flags: int = 0, na=None): 

1340 """ 

1341 Determine if each string starts with a match of a regular expression. 

1342 

1343 Parameters 

1344 ---------- 

1345 pat : str 

1346 Character sequence. 

1347 case : bool, default True 

1348 If True, case sensitive. 

1349 flags : int, default 0 (no flags) 

1350 Regex module flags, e.g. re.IGNORECASE. 

1351 na : scalar, optional 

1352 Fill value for missing values. The default depends on dtype of the 

1353 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1354 ``pandas.NA`` is used. 

1355 

1356 Returns 

1357 ------- 

1358 Series/Index/array of boolean values 

1359 

1360 See Also 

1361 -------- 

1362 fullmatch : Stricter matching that requires the entire string to match. 

1363 contains : Analogous, but less strict, relying on re.search instead of 

1364 re.match. 

1365 extract : Extract matched groups. 

1366 

1367 Examples 

1368 -------- 

1369 >>> ser = pd.Series(["horse", "eagle", "donkey"]) 

1370 >>> ser.str.match("e") 

1371 0 False 

1372 1 True 

1373 2 False 

1374 dtype: bool 

1375 """ 

1376 result = self._data.array._str_match(pat, case=case, flags=flags, na=na) 

1377 return self._wrap_result(result, fill_value=na, returns_string=False) 

1378 

1379 @forbid_nonstring_types(["bytes"]) 

1380 def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): 

1381 """ 

1382 Determine if each string entirely matches a regular expression. 

1383 

1384 Parameters 

1385 ---------- 

1386 pat : str 

1387 Character sequence or regular expression. 

1388 case : bool, default True 

1389 If True, case sensitive. 

1390 flags : int, default 0 (no flags) 

1391 Regex module flags, e.g. re.IGNORECASE. 

1392 na : scalar, optional 

1393 Fill value for missing values. The default depends on dtype of the 

1394 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1395 ``pandas.NA`` is used. 

1396 

1397 Returns 

1398 ------- 

1399 Series/Index/array of boolean values 

1400 

1401 See Also 

1402 -------- 

1403 match : Similar, but also returns `True` when only a *prefix* of the string 

1404 matches the regular expression. 

1405 extract : Extract matched groups. 

1406 

1407 Examples 

1408 -------- 

1409 >>> ser = pd.Series(["cat", "duck", "dove"]) 

1410 >>> ser.str.fullmatch(r'd.+') 

1411 0 False 

1412 1 True 

1413 2 True 

1414 dtype: bool 

1415 """ 

1416 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) 

1417 return self._wrap_result(result, fill_value=na, returns_string=False) 

1418 

1419 @forbid_nonstring_types(["bytes"]) 

1420 def replace( 

1421 self, 

1422 pat: str | re.Pattern, 

1423 repl: str | Callable, 

1424 n: int = -1, 

1425 case: bool | None = None, 

1426 flags: int = 0, 

1427 regex: bool = False, 

1428 ): 

1429 r""" 

1430 Replace each occurrence of pattern/regex in the Series/Index. 

1431 

1432 Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on 

1433 the regex value. 

1434 

1435 Parameters 

1436 ---------- 

1437 pat : str or compiled regex 

1438 String can be a character sequence or regular expression. 

1439 repl : str or callable 

1440 Replacement string or a callable. The callable is passed the regex 

1441 match object and must return a replacement string to be used. 

1442 See :func:`re.sub`. 

1443 n : int, default -1 (all) 

1444 Number of replacements to make from start. 

1445 case : bool, default None 

1446 Determines if replace is case sensitive: 

1447 

1448 - If True, case sensitive (the default if `pat` is a string) 

1449 - Set to False for case insensitive 

1450 - Cannot be set if `pat` is a compiled regex. 

1451 

1452 flags : int, default 0 (no flags) 

1453 Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled 

1454 regex. 

1455 regex : bool, default False 

1456 Determines if the passed-in pattern is a regular expression: 

1457 

1458 - If True, assumes the passed-in pattern is a regular expression. 

1459 - If False, treats the pattern as a literal string 

1460 - Cannot be set to False if `pat` is a compiled regex or `repl` is 

1461 a callable. 

1462 

1463 Returns 

1464 ------- 

1465 Series or Index of object 

1466 A copy of the object with all matching occurrences of `pat` replaced by 

1467 `repl`. 

1468 

1469 Raises 

1470 ------ 

1471 ValueError 

1472 * if `regex` is False and `repl` is a callable or `pat` is a compiled 

1473 regex 

1474 * if `pat` is a compiled regex and `case` or `flags` is set 

1475 

1476 Notes 

1477 ----- 

1478 When `pat` is a compiled regex, all flags should be included in the 

1479 compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled 

1480 regex will raise an error. 

1481 

1482 Examples 

1483 -------- 

1484 When `pat` is a string and `regex` is True, the given `pat` 

1485 is compiled as a regex. When `repl` is a string, it replaces matching 

1486 regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are 

1487 left as is: 

1488 

1489 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) 

1490 0 bao 

1491 1 baz 

1492 2 NaN 

1493 dtype: object 

1494 

1495 When `pat` is a string and `regex` is False, every `pat` is replaced with 

1496 `repl` as with :meth:`str.replace`: 

1497 

1498 >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) 

1499 0 bao 

1500 1 fuz 

1501 2 NaN 

1502 dtype: object 

1503 

1504 When `repl` is a callable, it is called on every `pat` using 

1505 :func:`re.sub`. The callable should expect one positional argument 

1506 (a regex object) and return a string. 

1507 

1508 To get the idea: 

1509 

1510 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True) 

1511 0 <re.Match object; span=(0, 1), match='f'>oo 

1512 1 <re.Match object; span=(0, 1), match='f'>uz 

1513 2 NaN 

1514 dtype: object 

1515 

1516 Reverse every lowercase alphabetic word: 

1517 

1518 >>> repl = lambda m: m.group(0)[::-1] 

1519 >>> ser = pd.Series(['foo 123', 'bar baz', np.nan]) 

1520 >>> ser.str.replace(r'[a-z]+', repl, regex=True) 

1521 0 oof 123 

1522 1 rab zab 

1523 2 NaN 

1524 dtype: object 

1525 

1526 Using regex groups (extract second group and swap case): 

1527 

1528 >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" 

1529 >>> repl = lambda m: m.group('two').swapcase() 

1530 >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz']) 

1531 >>> ser.str.replace(pat, repl, regex=True) 

1532 0 tWO 

1533 1 bAR 

1534 dtype: object 

1535 

1536 Using a compiled regex with flags 

1537 

1538 >>> import re 

1539 >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) 

1540 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True) 

1541 0 foo 

1542 1 bar 

1543 2 NaN 

1544 dtype: object 

1545 """ 

1546 # Check whether repl is valid (GH 13438, GH 15055) 

1547 if not (isinstance(repl, str) or callable(repl)): 

1548 raise TypeError("repl must be a string or callable") 

1549 

1550 is_compiled_re = is_re(pat) 

1551 if regex or regex is None: 

1552 if is_compiled_re and (case is not None or flags != 0): 

1553 raise ValueError( 

1554 "case and flags cannot be set when pat is a compiled regex" 

1555 ) 

1556 

1557 elif is_compiled_re: 

1558 raise ValueError( 

1559 "Cannot use a compiled regex as replacement pattern with regex=False" 

1560 ) 

1561 elif callable(repl): 

1562 raise ValueError("Cannot use a callable replacement when regex=False") 

1563 

1564 if case is None: 

1565 case = True 

1566 

1567 result = self._data.array._str_replace( 

1568 pat, repl, n=n, case=case, flags=flags, regex=regex 

1569 ) 

1570 return self._wrap_result(result) 

1571 

1572 @forbid_nonstring_types(["bytes"]) 

1573 def repeat(self, repeats): 

1574 """ 

1575 Duplicate each string in the Series or Index. 

1576 

1577 Parameters 

1578 ---------- 

1579 repeats : int or sequence of int 

1580 Same value for all (int) or different value per (sequence). 

1581 

1582 Returns 

1583 ------- 

1584 Series or pandas.Index 

1585 Series or Index of repeated string objects specified by 

1586 input parameter repeats. 

1587 

1588 Examples 

1589 -------- 

1590 >>> s = pd.Series(['a', 'b', 'c']) 

1591 >>> s 

1592 0 a 

1593 1 b 

1594 2 c 

1595 dtype: object 

1596 

1597 Single int repeats string in Series 

1598 

1599 >>> s.str.repeat(repeats=2) 

1600 0 aa 

1601 1 bb 

1602 2 cc 

1603 dtype: object 

1604 

1605 Sequence of int repeats corresponding string in Series 

1606 

1607 >>> s.str.repeat(repeats=[1, 2, 3]) 

1608 0 a 

1609 1 bb 

1610 2 ccc 

1611 dtype: object 

1612 """ 

1613 result = self._data.array._str_repeat(repeats) 

1614 return self._wrap_result(result) 

1615 

1616 @forbid_nonstring_types(["bytes"]) 

1617 def pad( 

1618 self, 

1619 width: int, 

1620 side: Literal["left", "right", "both"] = "left", 

1621 fillchar: str = " ", 

1622 ): 

1623 """ 

1624 Pad strings in the Series/Index up to width. 

1625 

1626 Parameters 

1627 ---------- 

1628 width : int 

1629 Minimum width of resulting string; additional characters will be filled 

1630 with character defined in `fillchar`. 

1631 side : {'left', 'right', 'both'}, default 'left' 

1632 Side from which to fill resulting string. 

1633 fillchar : str, default ' ' 

1634 Additional character for filling, default is whitespace. 

1635 

1636 Returns 

1637 ------- 

1638 Series or Index of object 

1639 Returns Series or Index with minimum number of char in object. 

1640 

1641 See Also 

1642 -------- 

1643 Series.str.rjust : Fills the left side of strings with an arbitrary 

1644 character. Equivalent to ``Series.str.pad(side='left')``. 

1645 Series.str.ljust : Fills the right side of strings with an arbitrary 

1646 character. Equivalent to ``Series.str.pad(side='right')``. 

1647 Series.str.center : Fills both sides of strings with an arbitrary 

1648 character. Equivalent to ``Series.str.pad(side='both')``. 

1649 Series.str.zfill : Pad strings in the Series/Index by prepending '0' 

1650 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. 

1651 

1652 Examples 

1653 -------- 

1654 >>> s = pd.Series(["caribou", "tiger"]) 

1655 >>> s 

1656 0 caribou 

1657 1 tiger 

1658 dtype: object 

1659 

1660 >>> s.str.pad(width=10) 

1661 0 caribou 

1662 1 tiger 

1663 dtype: object 

1664 

1665 >>> s.str.pad(width=10, side='right', fillchar='-') 

1666 0 caribou--- 

1667 1 tiger----- 

1668 dtype: object 

1669 

1670 >>> s.str.pad(width=10, side='both', fillchar='-') 

1671 0 -caribou-- 

1672 1 --tiger--- 

1673 dtype: object 

1674 """ 

1675 if not isinstance(fillchar, str): 

1676 msg = f"fillchar must be a character, not {type(fillchar).__name__}" 

1677 raise TypeError(msg) 

1678 

1679 if len(fillchar) != 1: 

1680 raise TypeError("fillchar must be a character, not str") 

1681 

1682 if not is_integer(width): 

1683 msg = f"width must be of integer type, not {type(width).__name__}" 

1684 raise TypeError(msg) 

1685 

1686 result = self._data.array._str_pad(width, side=side, fillchar=fillchar) 

1687 return self._wrap_result(result) 

1688 

    # Shared docstring template for the ``center``/``ljust``/``rjust`` padding
    # helpers; %(side)s and %(method)s are substituted by ``@Appender`` below.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series/Index of objects.

    Examples
    --------
    For Series.str.center:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.center(8, fillchar='.')
    0    ..dog...
    1    ..bird..
    2    .mouse..
    dtype: object

    For Series.str.ljust:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.ljust(8, fillchar='.')
    0    dog.....
    1    bird....
    2    mouse...
    dtype: object

    For Series.str.rjust:

    >>> ser = pd.Series(['dog', 'bird', 'mouse'])
    >>> ser.str.rjust(8, fillchar='.')
    0    .....dog
    1    ....bird
    2    ...mouse
    dtype: object
    """

1737 

1738 @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"}) 

1739 @forbid_nonstring_types(["bytes"]) 

1740 def center(self, width: int, fillchar: str = " "): 

1741 return self.pad(width, side="both", fillchar=fillchar) 

1742 

1743 @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"}) 

1744 @forbid_nonstring_types(["bytes"]) 

1745 def ljust(self, width: int, fillchar: str = " "): 

1746 return self.pad(width, side="right", fillchar=fillchar) 

1747 

1748 @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"}) 

1749 @forbid_nonstring_types(["bytes"]) 

1750 def rjust(self, width: int, fillchar: str = " "): 

1751 return self.pad(width, side="left", fillchar=fillchar) 

1752 

1753 @forbid_nonstring_types(["bytes"]) 

1754 def zfill(self, width: int): 

1755 """ 

1756 Pad strings in the Series/Index by prepending '0' characters. 

1757 

1758 Strings in the Series/Index are padded with '0' characters on the 

1759 left of the string to reach a total string length `width`. Strings 

1760 in the Series/Index with length greater or equal to `width` are 

1761 unchanged. 

1762 

1763 Parameters 

1764 ---------- 

1765 width : int 

1766 Minimum length of resulting string; strings with length less 

1767 than `width` be prepended with '0' characters. 

1768 

1769 Returns 

1770 ------- 

1771 Series/Index of objects. 

1772 

1773 See Also 

1774 -------- 

1775 Series.str.rjust : Fills the left side of strings with an arbitrary 

1776 character. 

1777 Series.str.ljust : Fills the right side of strings with an arbitrary 

1778 character. 

1779 Series.str.pad : Fills the specified sides of strings with an arbitrary 

1780 character. 

1781 Series.str.center : Fills both sides of strings with an arbitrary 

1782 character. 

1783 

1784 Notes 

1785 ----- 

1786 Differs from :meth:`str.zfill` which has special handling 

1787 for '+'/'-' in the string. 

1788 

1789 Examples 

1790 -------- 

1791 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) 

1792 >>> s 

1793 0 -1 

1794 1 1 

1795 2 1000 

1796 3 10 

1797 4 NaN 

1798 dtype: object 

1799 

1800 Note that ``10`` and ``NaN`` are not strings, therefore they are 

1801 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a 

1802 special character and the zero is added to the right of it 

1803 (:meth:`str.zfill` would have moved it to the left). ``1000`` 

1804 remains unchanged as it is longer than `width`. 

1805 

1806 >>> s.str.zfill(3) 

1807 0 -01 

1808 1 001 

1809 2 1000 

1810 3 NaN 

1811 4 NaN 

1812 dtype: object 

1813 """ 

1814 if not is_integer(width): 

1815 msg = f"width must be of integer type, not {type(width).__name__}" 

1816 raise TypeError(msg) 

1817 f = lambda x: x.zfill(width) 

1818 result = self._data.array._str_map(f) 

1819 return self._wrap_result(result) 

1820 

1821 def slice(self, start=None, stop=None, step=None): 

1822 """ 

1823 Slice substrings from each element in the Series or Index. 

1824 

1825 Parameters 

1826 ---------- 

1827 start : int, optional 

1828 Start position for slice operation. 

1829 stop : int, optional 

1830 Stop position for slice operation. 

1831 step : int, optional 

1832 Step size for slice operation. 

1833 

1834 Returns 

1835 ------- 

1836 Series or Index of object 

1837 Series or Index from sliced substring from original string object. 

1838 

1839 See Also 

1840 -------- 

1841 Series.str.slice_replace : Replace a slice with a string. 

1842 Series.str.get : Return element at position. 

1843 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` 

1844 being the position. 

1845 

1846 Examples 

1847 -------- 

1848 >>> s = pd.Series(["koala", "dog", "chameleon"]) 

1849 >>> s 

1850 0 koala 

1851 1 dog 

1852 2 chameleon 

1853 dtype: object 

1854 

1855 >>> s.str.slice(start=1) 

1856 0 oala 

1857 1 og 

1858 2 hameleon 

1859 dtype: object 

1860 

1861 >>> s.str.slice(start=-1) 

1862 0 a 

1863 1 g 

1864 2 n 

1865 dtype: object 

1866 

1867 >>> s.str.slice(stop=2) 

1868 0 ko 

1869 1 do 

1870 2 ch 

1871 dtype: object 

1872 

1873 >>> s.str.slice(step=2) 

1874 0 kaa 

1875 1 dg 

1876 2 caeen 

1877 dtype: object 

1878 

1879 >>> s.str.slice(start=0, stop=5, step=3) 

1880 0 kl 

1881 1 d 

1882 2 cm 

1883 dtype: object 

1884 

1885 Equivalent behaviour to: 

1886 

1887 >>> s.str[0:5:3] 

1888 0 kl 

1889 1 d 

1890 2 cm 

1891 dtype: object 

1892 """ 

1893 result = self._data.array._str_slice(start, stop, step) 

1894 return self._wrap_result(result) 

1895 

1896 @forbid_nonstring_types(["bytes"]) 

1897 def slice_replace(self, start=None, stop=None, repl=None): 

1898 """ 

1899 Replace a positional slice of a string with another value. 

1900 

1901 Parameters 

1902 ---------- 

1903 start : int, optional 

1904 Left index position to use for the slice. If not specified (None), 

1905 the slice is unbounded on the left, i.e. slice from the start 

1906 of the string. 

1907 stop : int, optional 

1908 Right index position to use for the slice. If not specified (None), 

1909 the slice is unbounded on the right, i.e. slice until the 

1910 end of the string. 

1911 repl : str, optional 

1912 String for replacement. If not specified (None), the sliced region 

1913 is replaced with an empty string. 

1914 

1915 Returns 

1916 ------- 

1917 Series or Index 

1918 Same type as the original object. 

1919 

1920 See Also 

1921 -------- 

1922 Series.str.slice : Just slicing without replacement. 

1923 

1924 Examples 

1925 -------- 

1926 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) 

1927 >>> s 

1928 0 a 

1929 1 ab 

1930 2 abc 

1931 3 abdc 

1932 4 abcde 

1933 dtype: object 

1934 

1935 Specify just `start`, meaning replace `start` until the end of the 

1936 string with `repl`. 

1937 

1938 >>> s.str.slice_replace(1, repl='X') 

1939 0 aX 

1940 1 aX 

1941 2 aX 

1942 3 aX 

1943 4 aX 

1944 dtype: object 

1945 

1946 Specify just `stop`, meaning the start of the string to `stop` is replaced 

1947 with `repl`, and the rest of the string is included. 

1948 

1949 >>> s.str.slice_replace(stop=2, repl='X') 

1950 0 X 

1951 1 X 

1952 2 Xc 

1953 3 Xdc 

1954 4 Xcde 

1955 dtype: object 

1956 

1957 Specify `start` and `stop`, meaning the slice from `start` to `stop` is 

1958 replaced with `repl`. Everything before or after `start` and `stop` is 

1959 included as is. 

1960 

1961 >>> s.str.slice_replace(start=1, stop=3, repl='X') 

1962 0 aX 

1963 1 aX 

1964 2 aX 

1965 3 aXc 

1966 4 aXde 

1967 dtype: object 

1968 """ 

1969 result = self._data.array._str_slice_replace(start, stop, repl) 

1970 return self._wrap_result(result) 

1971 

1972 def decode(self, encoding, errors: str = "strict"): 

1973 """ 

1974 Decode character string in the Series/Index using indicated encoding. 

1975 

1976 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in 

1977 python3. 

1978 

1979 Parameters 

1980 ---------- 

1981 encoding : str 

1982 errors : str, optional 

1983 

1984 Returns 

1985 ------- 

1986 Series or Index 

1987 

1988 Examples 

1989 -------- 

1990 For Series: 

1991 

1992 >>> ser = pd.Series([b'cow', b'123', b'()']) 

1993 >>> ser.str.decode('ascii') 

1994 0 cow 

1995 1 123 

1996 2 () 

1997 dtype: object 

1998 """ 

1999 # TODO: Add a similar _bytes interface. 

2000 if encoding in _cpython_optimized_decoders: 

2001 # CPython optimized implementation 

2002 f = lambda x: x.decode(encoding, errors) 

2003 else: 

2004 decoder = codecs.getdecoder(encoding) 

2005 f = lambda x: decoder(x, errors)[0] 

2006 arr = self._data.array 

2007 # assert isinstance(arr, (StringArray,)) 

2008 result = arr._str_map(f) 

2009 return self._wrap_result(result) 

2010 

2011 @forbid_nonstring_types(["bytes"]) 

2012 def encode(self, encoding, errors: str = "strict"): 

2013 """ 

2014 Encode character string in the Series/Index using indicated encoding. 

2015 

2016 Equivalent to :meth:`str.encode`. 

2017 

2018 Parameters 

2019 ---------- 

2020 encoding : str 

2021 errors : str, optional 

2022 

2023 Returns 

2024 ------- 

2025 Series/Index of objects 

2026 

2027 Examples 

2028 -------- 

2029 >>> ser = pd.Series(['cow', '123', '()']) 

2030 >>> ser.str.encode(encoding='ascii') 

2031 0 b'cow' 

2032 1 b'123' 

2033 2 b'()' 

2034 dtype: object 

2035 """ 

2036 result = self._data.array._str_encode(encoding, errors) 

2037 return self._wrap_result(result, returns_string=False) 

2038 

    # Shared docstring template for ``strip``/``lstrip``/``rstrip``; the
    # %(position)s, %(side)s and %(method)s placeholders are substituted by
    # the ``@Appender`` decorator on each of the three methods below.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """

2114 

2115 @Appender( 

2116 _shared_docs["str_strip"] 

2117 % { 

2118 "side": "left and right sides", 

2119 "method": "strip", 

2120 "position": "leading and trailing", 

2121 } 

2122 ) 

2123 @forbid_nonstring_types(["bytes"]) 

2124 def strip(self, to_strip=None): 

2125 result = self._data.array._str_strip(to_strip) 

2126 return self._wrap_result(result) 

2127 

2128 @Appender( 

2129 _shared_docs["str_strip"] 

2130 % {"side": "left side", "method": "lstrip", "position": "leading"} 

2131 ) 

2132 @forbid_nonstring_types(["bytes"]) 

2133 def lstrip(self, to_strip=None): 

2134 result = self._data.array._str_lstrip(to_strip) 

2135 return self._wrap_result(result) 

2136 

2137 @Appender( 

2138 _shared_docs["str_strip"] 

2139 % {"side": "right side", "method": "rstrip", "position": "trailing"} 

2140 ) 

2141 @forbid_nonstring_types(["bytes"]) 

2142 def rstrip(self, to_strip=None): 

2143 result = self._data.array._str_rstrip(to_strip) 

2144 return self._wrap_result(result) 

2145 

    # Shared docstring template for removeprefix/removesuffix; %(side)s and
    # %(other_side)s are filled in by @Appender on each method below.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """

2193 

2194 @Appender( 

2195 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"} 

2196 ) 

2197 @forbid_nonstring_types(["bytes"]) 

2198 def removeprefix(self, prefix: str): 

2199 result = self._data.array._str_removeprefix(prefix) 

2200 return self._wrap_result(result) 

2201 

2202 @Appender( 

2203 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"} 

2204 ) 

2205 @forbid_nonstring_types(["bytes"]) 

2206 def removesuffix(self, suffix: str): 

2207 result = self._data.array._str_removesuffix(suffix) 

2208 return self._wrap_result(result) 

2209 

2210 @forbid_nonstring_types(["bytes"]) 

2211 def wrap(self, width: int, **kwargs): 

2212 r""" 

2213 Wrap strings in Series/Index at specified line width. 

2214 

2215 This method has the same keyword parameters and defaults as 

2216 :class:`textwrap.TextWrapper`. 

2217 

2218 Parameters 

2219 ---------- 

2220 width : int 

2221 Maximum line width. 

2222 expand_tabs : bool, optional 

2223 If True, tab characters will be expanded to spaces (default: True). 

2224 replace_whitespace : bool, optional 

2225 If True, each whitespace character (as defined by string.whitespace) 

2226 remaining after tab expansion will be replaced by a single space 

2227 (default: True). 

2228 drop_whitespace : bool, optional 

2229 If True, whitespace that, after wrapping, happens to end up at the 

2230 beginning or end of a line is dropped (default: True). 

2231 break_long_words : bool, optional 

2232 If True, then words longer than width will be broken in order to ensure 

2233 that no lines are longer than width. If it is false, long words will 

2234 not be broken, and some lines may be longer than width (default: True). 

2235 break_on_hyphens : bool, optional 

2236 If True, wrapping will occur preferably on whitespace and right after 

2237 hyphens in compound words, as it is customary in English. If false, 

2238 only whitespaces will be considered as potentially good places for line 

2239 breaks, but you need to set break_long_words to false if you want truly 

2240 insecable words (default: True). 

2241 

2242 Returns 

2243 ------- 

2244 Series or Index 

2245 

2246 Notes 

2247 ----- 

2248 Internally, this method uses a :class:`textwrap.TextWrapper` instance with 

2249 default settings. To achieve behavior matching R's stringr library str_wrap 

2250 function, use the arguments: 

2251 

2252 - expand_tabs = False 

2253 - replace_whitespace = True 

2254 - drop_whitespace = True 

2255 - break_long_words = False 

2256 - break_on_hyphens = False 

2257 

2258 Examples 

2259 -------- 

2260 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) 

2261 >>> s.str.wrap(12) 

2262 0 line to be\nwrapped 

2263 1 another line\nto be\nwrapped 

2264 dtype: object 

2265 """ 

2266 result = self._data.array._str_wrap(width, **kwargs) 

2267 return self._wrap_result(result) 

2268 

2269 @forbid_nonstring_types(["bytes"]) 

2270 def get_dummies(self, sep: str = "|"): 

2271 """ 

2272 Return DataFrame of dummy/indicator variables for Series. 

2273 

2274 Each string in Series is split by sep and returned as a DataFrame 

2275 of dummy/indicator variables. 

2276 

2277 Parameters 

2278 ---------- 

2279 sep : str, default "|" 

2280 String to split on. 

2281 

2282 Returns 

2283 ------- 

2284 DataFrame 

2285 Dummy variables corresponding to values of the Series. 

2286 

2287 See Also 

2288 -------- 

2289 get_dummies : Convert categorical variable into dummy/indicator 

2290 variables. 

2291 

2292 Examples 

2293 -------- 

2294 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() 

2295 a b c 

2296 0 1 1 0 

2297 1 1 0 0 

2298 2 1 0 1 

2299 

2300 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() 

2301 a b c 

2302 0 1 1 0 

2303 1 0 0 0 

2304 2 1 0 1 

2305 """ 

2306 # we need to cast to Series of strings as only that has all 

2307 # methods available for making the dummies... 

2308 result, name = self._data.array._str_get_dummies(sep) 

2309 return self._wrap_result( 

2310 result, 

2311 name=name, 

2312 expand=True, 

2313 returns_string=False, 

2314 ) 

2315 

2316 @forbid_nonstring_types(["bytes"]) 

2317 def translate(self, table): 

2318 """ 

2319 Map all characters in the string through the given mapping table. 

2320 

2321 Equivalent to standard :meth:`str.translate`. 

2322 

2323 Parameters 

2324 ---------- 

2325 table : dict 

2326 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or 

2327 None. Unmapped characters are left untouched. 

2328 Characters mapped to None are deleted. :meth:`str.maketrans` is a 

2329 helper function for making translation tables. 

2330 

2331 Returns 

2332 ------- 

2333 Series or Index 

2334 

2335 Examples 

2336 -------- 

2337 >>> ser = pd.Series(["El niño", "Françoise"]) 

2338 >>> mytable = str.maketrans({'ñ': 'n', 'ç': 'c'}) 

2339 >>> ser.str.translate(mytable) 

2340 0 El nino 

2341 1 Francoise 

2342 dtype: object 

2343 """ 

2344 result = self._data.array._str_translate(table) 

2345 dtype = object if self._data.dtype == "object" else None 

2346 return self._wrap_result(result, dtype=dtype) 

2347 

2348 @forbid_nonstring_types(["bytes"]) 

2349 def count(self, pat, flags: int = 0): 

2350 r""" 

2351 Count occurrences of pattern in each string of the Series/Index. 

2352 

2353 This function is used to count the number of times a particular regex 

2354 pattern is repeated in each of the string elements of the 

2355 :class:`~pandas.Series`. 

2356 

2357 Parameters 

2358 ---------- 

2359 pat : str 

2360 Valid regular expression. 

2361 flags : int, default 0, meaning no flags 

2362 Flags for the `re` module. For a complete list, `see here 

2363 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_. 

2364 **kwargs 

2365 For compatibility with other string methods. Not used. 

2366 

2367 Returns 

2368 ------- 

2369 Series or Index 

2370 Same type as the calling object containing the integer counts. 

2371 

2372 See Also 

2373 -------- 

2374 re : Standard library module for regular expressions. 

2375 str.count : Standard library version, without regular expression support. 

2376 

2377 Notes 

2378 ----- 

2379 Some characters need to be escaped when passing in `pat`. 

2380 eg. ``'$'`` has a special meaning in regex and must be escaped when 

2381 finding this literal character. 

2382 

2383 Examples 

2384 -------- 

2385 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) 

2386 >>> s.str.count('a') 

2387 0 0.0 

2388 1 0.0 

2389 2 2.0 

2390 3 2.0 

2391 4 NaN 

2392 5 0.0 

2393 6 1.0 

2394 dtype: float64 

2395 

2396 Escape ``'$'`` to find the literal dollar sign. 

2397 

2398 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) 

2399 >>> s.str.count('\\$') 

2400 0 1 

2401 1 0 

2402 2 1 

2403 3 2 

2404 4 2 

2405 5 0 

2406 dtype: int64 

2407 

2408 This is also available on Index 

2409 

2410 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') 

2411 Index([0, 0, 2, 1], dtype='int64') 

2412 """ 

2413 result = self._data.array._str_count(pat, flags) 

2414 return self._wrap_result(result, returns_string=False) 

2415 

2416 @forbid_nonstring_types(["bytes"]) 

2417 def startswith( 

2418 self, pat: str | tuple[str, ...], na: Scalar | None = None 

2419 ) -> Series | Index: 

2420 """ 

2421 Test if the start of each string element matches a pattern. 

2422 

2423 Equivalent to :meth:`str.startswith`. 

2424 

2425 Parameters 

2426 ---------- 

2427 pat : str or tuple[str, ...] 

2428 Character sequence or tuple of strings. Regular expressions are not 

2429 accepted. 

2430 na : object, default NaN 

2431 Object shown if element tested is not a string. The default depends 

2432 on dtype of the array. For object-dtype, ``numpy.nan`` is used. 

2433 For ``StringDtype``, ``pandas.NA`` is used. 

2434 

2435 Returns 

2436 ------- 

2437 Series or Index of bool 

2438 A Series of booleans indicating whether the given pattern matches 

2439 the start of each string element. 

2440 

2441 See Also 

2442 -------- 

2443 str.startswith : Python standard library string method. 

2444 Series.str.endswith : Same as startswith, but tests the end of string. 

2445 Series.str.contains : Tests if string element contains a pattern. 

2446 

2447 Examples 

2448 -------- 

2449 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) 

2450 >>> s 

2451 0 bat 

2452 1 Bear 

2453 2 cat 

2454 3 NaN 

2455 dtype: object 

2456 

2457 >>> s.str.startswith('b') 

2458 0 True 

2459 1 False 

2460 2 False 

2461 3 NaN 

2462 dtype: object 

2463 

2464 >>> s.str.startswith(('b', 'B')) 

2465 0 True 

2466 1 True 

2467 2 False 

2468 3 NaN 

2469 dtype: object 

2470 

2471 Specifying `na` to be `False` instead of `NaN`. 

2472 

2473 >>> s.str.startswith('b', na=False) 

2474 0 True 

2475 1 False 

2476 2 False 

2477 3 False 

2478 dtype: bool 

2479 """ 

2480 if not isinstance(pat, (str, tuple)): 

2481 msg = f"expected a string or tuple, not {type(pat).__name__}" 

2482 raise TypeError(msg) 

2483 result = self._data.array._str_startswith(pat, na=na) 

2484 return self._wrap_result(result, returns_string=False) 

2485 

2486 @forbid_nonstring_types(["bytes"]) 

2487 def endswith( 

2488 self, pat: str | tuple[str, ...], na: Scalar | None = None 

2489 ) -> Series | Index: 

2490 """ 

2491 Test if the end of each string element matches a pattern. 

2492 

2493 Equivalent to :meth:`str.endswith`. 

2494 

2495 Parameters 

2496 ---------- 

2497 pat : str or tuple[str, ...] 

2498 Character sequence or tuple of strings. Regular expressions are not 

2499 accepted. 

2500 na : object, default NaN 

2501 Object shown if element tested is not a string. The default depends 

2502 on dtype of the array. For object-dtype, ``numpy.nan`` is used. 

2503 For ``StringDtype``, ``pandas.NA`` is used. 

2504 

2505 Returns 

2506 ------- 

2507 Series or Index of bool 

2508 A Series of booleans indicating whether the given pattern matches 

2509 the end of each string element. 

2510 

2511 See Also 

2512 -------- 

2513 str.endswith : Python standard library string method. 

2514 Series.str.startswith : Same as endswith, but tests the start of string. 

2515 Series.str.contains : Tests if string element contains a pattern. 

2516 

2517 Examples 

2518 -------- 

2519 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) 

2520 >>> s 

2521 0 bat 

2522 1 bear 

2523 2 caT 

2524 3 NaN 

2525 dtype: object 

2526 

2527 >>> s.str.endswith('t') 

2528 0 True 

2529 1 False 

2530 2 False 

2531 3 NaN 

2532 dtype: object 

2533 

2534 >>> s.str.endswith(('t', 'T')) 

2535 0 True 

2536 1 False 

2537 2 True 

2538 3 NaN 

2539 dtype: object 

2540 

2541 Specifying `na` to be `False` instead of `NaN`. 

2542 

2543 >>> s.str.endswith('t', na=False) 

2544 0 True 

2545 1 False 

2546 2 False 

2547 3 False 

2548 dtype: bool 

2549 """ 

2550 if not isinstance(pat, (str, tuple)): 

2551 msg = f"expected a string or tuple, not {type(pat).__name__}" 

2552 raise TypeError(msg) 

2553 result = self._data.array._str_endswith(pat, na=na) 

2554 return self._wrap_result(result, returns_string=False) 

2555 

2556 @forbid_nonstring_types(["bytes"]) 

2557 def findall(self, pat, flags: int = 0): 

2558 """ 

2559 Find all occurrences of pattern or regular expression in the Series/Index. 

2560 

2561 Equivalent to applying :func:`re.findall` to all the elements in the 

2562 Series/Index. 

2563 

2564 Parameters 

2565 ---------- 

2566 pat : str 

2567 Pattern or regular expression. 

2568 flags : int, default 0 

2569 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which 

2570 means no flags). 

2571 

2572 Returns 

2573 ------- 

2574 Series/Index of lists of strings 

2575 All non-overlapping matches of pattern or regular expression in each 

2576 string of this Series/Index. 

2577 

2578 See Also 

2579 -------- 

2580 count : Count occurrences of pattern or regular expression in each string 

2581 of the Series/Index. 

2582 extractall : For each string in the Series, extract groups from all matches 

2583 of regular expression and return a DataFrame with one row for each 

2584 match and one column for each group. 

2585 re.findall : The equivalent ``re`` function to all non-overlapping matches 

2586 of pattern or regular expression in string, as a list of strings. 

2587 

2588 Examples 

2589 -------- 

2590 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) 

2591 

2592 The search for the pattern 'Monkey' returns one match: 

2593 

2594 >>> s.str.findall('Monkey') 

2595 0 [] 

2596 1 [Monkey] 

2597 2 [] 

2598 dtype: object 

2599 

2600 On the other hand, the search for the pattern 'MONKEY' doesn't return any 

2601 match: 

2602 

2603 >>> s.str.findall('MONKEY') 

2604 0 [] 

2605 1 [] 

2606 2 [] 

2607 dtype: object 

2608 

2609 Flags can be added to the pattern or regular expression. For instance, 

2610 to find the pattern 'MONKEY' ignoring the case: 

2611 

2612 >>> import re 

2613 >>> s.str.findall('MONKEY', flags=re.IGNORECASE) 

2614 0 [] 

2615 1 [Monkey] 

2616 2 [] 

2617 dtype: object 

2618 

2619 When the pattern matches more than one string in the Series, all matches 

2620 are returned: 

2621 

2622 >>> s.str.findall('on') 

2623 0 [on] 

2624 1 [on] 

2625 2 [] 

2626 dtype: object 

2627 

2628 Regular expressions are supported too. For instance, the search for all the 

2629 strings ending with the word 'on' is shown next: 

2630 

2631 >>> s.str.findall('on$') 

2632 0 [on] 

2633 1 [] 

2634 2 [] 

2635 dtype: object 

2636 

2637 If the pattern is found more than once in the same string, then a list of 

2638 multiple strings is returned: 

2639 

2640 >>> s.str.findall('b') 

2641 0 [] 

2642 1 [] 

2643 2 [b, b] 

2644 dtype: object 

2645 """ 

2646 result = self._data.array._str_findall(pat, flags) 

2647 return self._wrap_result(result, returns_string=False) 

2648 

    @forbid_nonstring_types(["bytes"])
    def extract(
        self, pat: str, flags: int = 0, expand: bool = True
    ) -> DataFrame | Series | Index:
        r"""
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the
        first match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
            modify regular expression matching for things like case,
            spaces, etc. For more details, see :mod:`re`.
        expand : bool, default True
            If True, return DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group
            or DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series or Index
            A DataFrame with one row for each subject string, and one
            column for each group. Any capture group names in regular
            expression pat will be used for column names; otherwise
            capture group numbers will be used. The dtype of each result
            column is always object, even when no match is found. If
            ``expand=False`` and pat has only one capture group, then
            return a Series (if subject is a Series) or Index (if subject
            is an Index).

        See Also
        --------
        extractall : Returns all matches (not just the first match).

        Examples
        --------
        A pattern with two groups will return a DataFrame with two columns.
        Non-matches will be NaN.

        >>> s = pd.Series(['a1', 'b2', 'c3'])
        >>> s.str.extract(r'([ab])(\d)')
             0    1
        0    a    1
        1    b    2
        2  NaN  NaN

        A pattern may contain optional groups.

        >>> s.str.extract(r'([ab])?(\d)')
             0  1
        0    a  1
        1    b  2
        2  NaN  3

        Named groups will become column names in the result.

        >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
          letter digit
        0      a     1
        1      b     2
        2    NaN   NaN

        A pattern with one group will return a DataFrame with one column
        if expand=True.

        >>> s.str.extract(r'[ab](\d)', expand=True)
             0
        0    1
        1    2
        2  NaN

        A pattern with one group will return a Series if expand=False.

        >>> s.str.extract(r'[ab](\d)', expand=False)
        0      1
        1      2
        2    NaN
        dtype: object
        """
        from pandas import DataFrame

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        # Compile up front so group-count validation happens before any work.
        regex = re.compile(pat, flags=flags)
        if regex.groups == 0:
            raise ValueError("pattern contains no capture groups")

        # An Index cannot hold a multi-column (DataFrame) result, so multiple
        # groups with expand=False are rejected for Index callers.
        if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
            raise ValueError("only one regex group is supported with Index")

        obj = self._data
        result_dtype = _result_dtype(obj)

        # A DataFrame is produced whenever there are multiple groups, or the
        # caller asked for expansion explicitly.
        returns_df = regex.groups > 1 or expand

        if returns_df:
            name = None
            columns = _get_group_names(regex)

            if obj.array.size == 0:
                # Empty input: build an empty frame with the right columns and
                # dtype instead of dispatching to the array.
                result = DataFrame(columns=columns, dtype=result_dtype)

            else:
                result_list = self._data.array._str_extract(
                    pat, flags=flags, expand=returns_df
                )

                # Series callers keep their index; Index callers get a default.
                result_index: Index | None
                if isinstance(obj, ABCSeries):
                    result_index = obj.index
                else:
                    result_index = None

                result = DataFrame(
                    result_list, columns=columns, index=result_index, dtype=result_dtype
                )

        else:
            # Single group, expand=False: result is 1-D; a named group becomes
            # the result's name.
            name = _get_single_group_name(regex)
            result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
        return self._wrap_result(result, name=name, dtype=result_dtype)

2776 

    @forbid_nonstring_types(["bytes"])
    def extractall(self, pat, flags: int = 0) -> DataFrame:
        r"""
        Extract capture groups in the regex `pat` as columns in DataFrame.

        For each subject string in the Series, extract groups from all
        matches of regular expression pat. When each subject string in the
        Series has exactly one match, extractall(pat).xs(0, level='match')
        is the same as extract(pat).

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            A ``re`` module flag, for example ``re.IGNORECASE``. These allow
            to modify regular expression matching for things like case, spaces,
            etc. Multiple flags can be combined with the bitwise OR operator,
            for example ``re.IGNORECASE | re.MULTILINE``.

        Returns
        -------
        DataFrame
            A ``DataFrame`` with one row for each match, and one column for each
            group. Its rows have a ``MultiIndex`` with first levels that come from
            the subject ``Series``. The last level is named 'match' and indexes the
            matches in each item of the ``Series``. Any capture group names in
            regular expression pat will be used for column names; otherwise capture
            group numbers will be used.

        See Also
        --------
        extract : Returns first match only (not all matches).

        Examples
        --------
        A pattern with one group will return a DataFrame with one column.
        Indices with no matches will not appear in the result.

        >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
        >>> s.str.extractall(r"[ab](\d)")
                 0
          match
        A 0      1
          1      2
        B 0      1

        Capture group names are used for column names of the result.

        >>> s.str.extractall(r"[ab](?P<digit>\d)")
                digit
          match
        A 0         1
          1         2
        B 0         1

        A pattern with two groups will return a DataFrame with two columns.

        >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)")
                letter digit
          match
        A 0          a     1
          1          a     2
        B 0          b     1

        Optional groups that do not match are NaN in the result.

        >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)")
                letter digit
          match
        A 0          a     1
          1          a     2
        B 0          b     1
        C 0        NaN     1
        """
        # TODO: dispatch
        # Unlike the other accessor methods this does not go through the
        # array-level dispatch; it delegates to the module-level helper on the
        # original (unfiltered) Series/Index.
        return str_extractall(self._orig, pat, flags)

2854 

    # Shared docstring template for find/rfind; %(side)s, %(method)s and
    # %(also)s are filled in by @Appender on each concrete method below.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s

    Examples
    --------
    For Series.str.find:

    >>> ser = pd.Series(["cow_", "duck_", "do_ve"])
    >>> ser.str.find("_")
    0   3
    1   4
    2   2
    dtype: int64

    For Series.str.rfind:

    >>> ser = pd.Series(["_cow_", "duck_", "do_v_e"])
    >>> ser.str.rfind("_")
    0   4
    1   4
    2   4
    dtype: int64
    """

2901 

2902 @Appender( 

2903 _shared_docs["find"] 

2904 % { 

2905 "side": "lowest", 

2906 "method": "find", 

2907 "also": "rfind : Return highest indexes in each strings.", 

2908 } 

2909 ) 

2910 @forbid_nonstring_types(["bytes"]) 

2911 def find(self, sub, start: int = 0, end=None): 

2912 if not isinstance(sub, str): 

2913 msg = f"expected a string object, not {type(sub).__name__}" 

2914 raise TypeError(msg) 

2915 

2916 result = self._data.array._str_find(sub, start, end) 

2917 return self._wrap_result(result, returns_string=False) 

2918 

2919 @Appender( 

2920 _shared_docs["find"] 

2921 % { 

2922 "side": "highest", 

2923 "method": "rfind", 

2924 "also": "find : Return lowest indexes in each strings.", 

2925 } 

2926 ) 

2927 @forbid_nonstring_types(["bytes"]) 

2928 def rfind(self, sub, start: int = 0, end=None): 

2929 if not isinstance(sub, str): 

2930 msg = f"expected a string object, not {type(sub).__name__}" 

2931 raise TypeError(msg) 

2932 

2933 result = self._data.array._str_rfind(sub, start=start, end=end) 

2934 return self._wrap_result(result, returns_string=False) 

2935 

2936 @forbid_nonstring_types(["bytes"]) 

2937 def normalize(self, form): 

2938 """ 

2939 Return the Unicode normal form for the strings in the Series/Index. 

2940 

2941 For more information on the forms, see the 

2942 :func:`unicodedata.normalize`. 

2943 

2944 Parameters 

2945 ---------- 

2946 form : {'NFC', 'NFKC', 'NFD', 'NFKD'} 

2947 Unicode form. 

2948 

2949 Returns 

2950 ------- 

2951 Series/Index of objects 

2952 

2953 Examples 

2954 -------- 

2955 >>> ser = pd.Series(['ñ']) 

2956 >>> ser.str.normalize('NFC') == ser.str.normalize('NFD') 

2957 0 False 

2958 dtype: bool 

2959 """ 

2960 result = self._data.array._str_normalize(form) 

2961 return self._wrap_result(result) 

2962 

    # Shared docstring template for index/rindex (the strict, ValueError-raising
    # counterparts of find/rfind); placeholders filled by @Appender below.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s

    Examples
    --------
    For Series.str.index:

    >>> ser = pd.Series(["horse", "eagle", "donkey"])
    >>> ser.str.index("e")
    0   4
    1   0
    2   4
    dtype: int64

    For Series.str.rindex:

    >>> ser = pd.Series(["Deer", "eagle", "Sheep"])
    >>> ser.str.rindex("e")
    0   2
    1   4
    2   3
    dtype: int64
    """

3011 

3012 @Appender( 

3013 _shared_docs["index"] 

3014 % { 

3015 "side": "lowest", 

3016 "similar": "find", 

3017 "method": "index", 

3018 "also": "rindex : Return highest indexes in each strings.", 

3019 } 

3020 ) 

3021 @forbid_nonstring_types(["bytes"]) 

3022 def index(self, sub, start: int = 0, end=None): 

3023 if not isinstance(sub, str): 

3024 msg = f"expected a string object, not {type(sub).__name__}" 

3025 raise TypeError(msg) 

3026 

3027 result = self._data.array._str_index(sub, start=start, end=end) 

3028 return self._wrap_result(result, returns_string=False) 

3029 

3030 @Appender( 

3031 _shared_docs["index"] 

3032 % { 

3033 "side": "highest", 

3034 "similar": "rfind", 

3035 "method": "rindex", 

3036 "also": "index : Return lowest indexes in each strings.", 

3037 } 

3038 ) 

3039 @forbid_nonstring_types(["bytes"]) 

3040 def rindex(self, sub, start: int = 0, end=None): 

3041 if not isinstance(sub, str): 

3042 msg = f"expected a string object, not {type(sub).__name__}" 

3043 raise TypeError(msg) 

3044 

3045 result = self._data.array._str_rindex(sub, start=start, end=end) 

3046 return self._wrap_result(result, returns_string=False) 

3047 

3048 def len(self): 

3049 """ 

3050 Compute the length of each element in the Series/Index. 

3051 

3052 The element may be a sequence (such as a string, tuple or list) or a collection 

3053 (such as a dictionary). 

3054 

3055 Returns 

3056 ------- 

3057 Series or Index of int 

3058 A Series or Index of integer values indicating the length of each 

3059 element in the Series or Index. 

3060 

3061 See Also 

3062 -------- 

3063 str.len : Python built-in function returning the length of an object. 

3064 Series.size : Returns the length of the Series. 

3065 

3066 Examples 

3067 -------- 

3068 Returns the length (number of characters) in a string. Returns the 

3069 number of entries for dictionaries, lists or tuples. 

3070 

3071 >>> s = pd.Series(['dog', 

3072 ... '', 

3073 ... 5, 

3074 ... {'foo' : 'bar'}, 

3075 ... [2, 3, 5, 7], 

3076 ... ('one', 'two', 'three')]) 

3077 >>> s 

3078 0 dog 

3079 1 

3080 2 5 

3081 3 {'foo': 'bar'} 

3082 4 [2, 3, 5, 7] 

3083 5 (one, two, three) 

3084 dtype: object 

3085 >>> s.str.len() 

3086 0 3.0 

3087 1 0.0 

3088 2 NaN 

3089 3 1.0 

3090 4 4.0 

3091 5 3.0 

3092 dtype: float64 

3093 """ 

3094 result = self._data.array._str_len() 

3095 return self._wrap_result(result, returns_string=False) 

3096 

    # Shared docstring template for the case-conversion methods
    # (lower/upper/title/capitalize/swapcase/casefold); the %(...)s fields are
    # filled from _doc_args below via @Appender.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """
    # Types:
    # cases:
    #     upper, lower, title, capitalize, swapcase, casefold
    # boolean:
    #     isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle
    # _doc_args holds dict of strings to use in substituting casemethod docs
    _doc_args: dict[str, dict[str, str]] = {}
    _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""}
    _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""}
    _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""}
    _doc_args["capitalize"] = {
        "type": "be capitalized",
        "method": "capitalize",
        "version": "",
    }
    _doc_args["swapcase"] = {
        "type": "be swapcased",
        "method": "swapcase",
        "version": "",
    }
    _doc_args["casefold"] = {
        "type": "be casefolded",
        "method": "casefold",
        "version": "",
    }

3190 

3191 @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) 

3192 @forbid_nonstring_types(["bytes"]) 

3193 def lower(self): 

3194 result = self._data.array._str_lower() 

3195 return self._wrap_result(result) 

3196 

3197 @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) 

3198 @forbid_nonstring_types(["bytes"]) 

3199 def upper(self): 

3200 result = self._data.array._str_upper() 

3201 return self._wrap_result(result) 

3202 

3203 @Appender(_shared_docs["casemethods"] % _doc_args["title"]) 

3204 @forbid_nonstring_types(["bytes"]) 

3205 def title(self): 

3206 result = self._data.array._str_title() 

3207 return self._wrap_result(result) 

3208 

3209 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) 

3210 @forbid_nonstring_types(["bytes"]) 

3211 def capitalize(self): 

3212 result = self._data.array._str_capitalize() 

3213 return self._wrap_result(result) 

3214 

3215 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) 

3216 @forbid_nonstring_types(["bytes"]) 

3217 def swapcase(self): 

3218 result = self._data.array._str_swapcase() 

3219 return self._wrap_result(result) 

3220 

3221 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) 

3222 @forbid_nonstring_types(["bytes"]) 

3223 def casefold(self): 

3224 result = self._data.array._str_casefold() 

3225 return self._wrap_result(result) 

3226 

    # Doc template for the boolean is* predicates (isalnum, isdigit, ...);
    # %(type)s and %(method)s are substituted per-method from the
    # `_doc_args` entries that follow.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

3379 

    # Boolean is* accessor methods, generated via _map_and_wrap so each one
    # forwards to the corresponding `_str_is*` array kernel and carries the
    # shared `ismethods` docstring with its own substitutions.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )

3407 

3408 

def cat_safe(list_of_columns: list[npt.NDArray[np.object_]], sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If concatenation fails because a column contains non-string values,
        or (re-raised) for any other TypeError from the concatenation.
    """
    try:
        return cat_core(list_of_columns, sep)
    except TypeError as err:
        # if there are any non-string values (wrong dtype or hidden behind
        # object dtype), np.sum will fail; catch and raise with a better
        # message naming the offending inferred dtype
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # No offending column found: previously control fell through to
        # `return result` with `result` unbound, masking the real error
        # behind an UnboundLocalError; re-raise the original instead.
        raise err

3443 

3444 

def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # empty separator: just element-wise sum (string concat) the columns
        return np.sum(np.asarray(list_of_columns, dtype=object), axis=0)
    # Interleave the separator between the columns, i.e.
    # [col0, sep, col1, sep, ..., colN], then sum element-wise.
    interleaved: list = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)

3470 

3471 

3472def _result_dtype(arr): 

3473 # workaround #27953 

3474 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails 

3475 # when the list of values is empty. 

3476 from pandas.core.arrays.string_ import StringDtype 

3477 

3478 if isinstance(arr.dtype, (ArrowDtype, StringDtype)): 

3479 return arr.dtype 

3480 return object 

3481 

3482 

3483def _get_single_group_name(regex: re.Pattern) -> Hashable: 

3484 if regex.groupindex: 

3485 return next(iter(regex.groupindex)) 

3486 else: 

3487 return None 

3488 

3489 

3490def _get_group_names(regex: re.Pattern) -> list[Hashable]: 

3491 """ 

3492 Get named groups from compiled regex. 

3493 

3494 Unnamed groups are numbered. 

3495 

3496 Parameters 

3497 ---------- 

3498 regex : compiled regex 

3499 

3500 Returns 

3501 ------- 

3502 list of column labels 

3503 """ 

3504 names = {v: k for k, v in regex.groupindex.items()} 

3505 return [names.get(1 + i, i) for i in range(regex.groups)] 

3506 

3507 

def str_extractall(arr, pat, flags: int = 0) -> DataFrame:
    """
    Extract all matches of ``pat`` from each string in ``arr``.

    Returns a DataFrame with one row per match, indexed by the original
    index plus a ``match`` level counting matches within each subject.
    Non-matching capture groups become NaN.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True).astype(arr.dtype)

    columns = _get_group_names(regex)
    matches = []
    keys = []
    use_multi = arr.index.nlevels > 1

    for label, value in arr.items():
        # non-string entries (e.g. NaN) produce no rows
        if not isinstance(value, str):
            continue
        if not use_multi:
            # normalize flat labels to 1-tuples so concatenation below works
            label = (label,)

        for match_num, groups in enumerate(regex.findall(value)):
            if isinstance(groups, str):
                # findall returns bare strings for single-group patterns
                groups = (groups,)
            matches.append([np.nan if group == "" else group for group in groups])
            keys.append(tuple(label + (match_num,)))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(keys, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    return arr._constructor_expanddim(matches, index=index, columns=columns, dtype=dtype)

3543 return result