Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/strings/accessor.py: 34%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

581 statements  

1from __future__ import annotations 

2 

3import codecs 

4from functools import wraps 

5import re 

6from typing import ( 

7 TYPE_CHECKING, 

8 Callable, 

9 Hashable, 

10 Literal, 

11 cast, 

12) 

13import warnings 

14 

15import numpy as np 

16 

17from pandas._libs import lib 

18from pandas._typing import ( 

19 AlignJoin, 

20 DtypeObj, 

21 F, 

22 Scalar, 

23) 

24from pandas.util._decorators import Appender 

25from pandas.util._exceptions import find_stack_level 

26 

27from pandas.core.dtypes.common import ( 

28 ensure_object, 

29 is_bool_dtype, 

30 is_categorical_dtype, 

31 is_integer, 

32 is_list_like, 

33 is_object_dtype, 

34 is_re, 

35) 

36from pandas.core.dtypes.generic import ( 

37 ABCDataFrame, 

38 ABCIndex, 

39 ABCMultiIndex, 

40 ABCSeries, 

41) 

42from pandas.core.dtypes.missing import isna 

43 

44from pandas.core.arrays.arrow.dtype import ArrowDtype 

45from pandas.core.base import NoNewAttributesMixin 

46from pandas.core.construction import extract_array 

47 

48if TYPE_CHECKING: 

49 from pandas import ( 

50 DataFrame, 

51 Index, 

52 Series, 

53 ) 

54 

55_shared_docs: dict[str, str] = {} 

56_cpython_optimized_encoders = ( 

57 "utf-8", 

58 "utf8", 

59 "latin-1", 

60 "latin1", 

61 "iso-8859-1", 

62 "mbcs", 

63 "ascii", 

64) 

65_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") 

66 

67 

def forbid_nonstring_types(
    forbidden: list[str] | None, name: str | None = None
) -> Callable[[F], F]:
    """
    Build a decorator that blocks StringMethods calls on disallowed dtypes.

    A single :class:`StringMethods` object serves every ``.str`` method, but
    each method supports only a subset of the inferred dtypes that
    :meth:`StringMethods._validate` accepts, namely the union
    ``['string', 'empty', 'bytes', 'mixed', 'mixed-integer']``.  The default
    string types ``['string', 'empty']`` are always permitted; a method uses
    this decorator to reject whichever of the additional types it cannot
    handle.

    Parameters
    ----------
    forbidden : list-of-str or None
        Inferred types to reject; one or more of
        ``['bytes', 'mixed', 'mixed-integer']``.
    name : str, default None
        Name of the method to use in the error message.  When None, the
        wrapped function's own ``__name__`` is used; pass it explicitly when
        stacking further wrappers (like _pat_wrapper and _noarg_wrapper).

    Returns
    -------
    func : wrapper
        Decorator that adds a check enforcing ``self._inferred_dtype`` is not
        one of the forbidden types.

    Raises
    ------
    TypeError
        If the inferred type of the underlying data is in `forbidden`.
    """
    # Normalize None to "nothing forbidden", then keep only the complement.
    banned = set(forbidden) if forbidden is not None else set()
    permitted = {"string", "empty", "bytes", "mixed", "mixed-integer"} - banned

    def _forbid_nonstring_types(func: F) -> F:
        display_name = name if name is not None else func.__name__

        @wraps(func)
        def wrapper(self, *args, **kwargs):
            if self._inferred_dtype not in permitted:
                raise TypeError(
                    f"Cannot use .str.{display_name} with values of "
                    f"inferred dtype '{self._inferred_dtype}'."
                )
            return func(self, *args, **kwargs)

        # Expose the intended name even when `name` overrides the original.
        wrapper.__name__ = display_name
        return cast(F, wrapper)

    return _forbid_nonstring_types

135 

136 

def _map_and_wrap(name, docstring):
    """
    Create a nullary ``.str`` accessor method dispatching to ``_str_<name>``.

    The produced method forbids bytes input, invokes the backing array's
    ``_str_{name}`` implementation with no arguments, and wraps the raw
    result via ``_wrap_result``.  The supplied ``docstring`` becomes the
    method's ``__doc__``.
    """

    @forbid_nonstring_types(["bytes"], name=name)
    def wrapper(self):
        mapped = getattr(self._data.array, f"_str_{name}")()
        return self._wrap_result(mapped)

    wrapper.__doc__ = docstring
    return wrapper

145 

146 

147class StringMethods(NoNewAttributesMixin): 

148 """ 

149 Vectorized string functions for Series and Index. 

150 

151 NAs stay NA unless handled otherwise by a particular method. 

152 Patterned after Python's string methods, with some inspiration from 

153 R's stringr package. 

154 

155 Examples 

156 -------- 

157 >>> s = pd.Series(["A_Str_Series"]) 

158 >>> s 

159 0 A_Str_Series 

160 dtype: object 

161 

162 >>> s.str.split("_") 

163 0 [A, Str, Series] 

164 dtype: object 

165 

166 >>> s.str.replace("_", "") 

167 0 AStrSeries 

168 dtype: object 

169 """ 

170 

171 # Note: see the docstring in pandas.core.strings.__init__ 

172 # for an explanation of the implementation. 

173 # TODO: Dispatch all the methods 

174 # Currently the following are not dispatched to the array 

175 # * cat 

176 # * extractall 

177 

178 def __init__(self, data) -> None: 

179 from pandas.core.arrays.string_ import StringDtype 

180 

181 self._inferred_dtype = self._validate(data) 

182 self._is_categorical = is_categorical_dtype(data.dtype) 

183 self._is_string = isinstance(data.dtype, StringDtype) 

184 self._data = data 

185 

186 self._index = self._name = None 

187 if isinstance(data, ABCSeries): 

188 self._index = data.index 

189 self._name = data.name 

190 

191 # ._values.categories works for both Series/Index 

192 self._parent = data._values.categories if self._is_categorical else data 

193 # save orig to blow up categoricals to the right type 

194 self._orig = data 

195 self._freeze() 

196 

197 @staticmethod 

198 def _validate(data): 

199 """ 

200 Auxiliary function for StringMethods, infers and checks dtype of data. 

201 

202 This is a "first line of defence" at the creation of the StringMethods- 

203 object, and just checks that the dtype is in the 

204 *union* of the allowed types over all string methods below; this 

205 restriction is then refined on a per-method basis using the decorator 

206 @forbid_nonstring_types (more info in the corresponding docstring). 

207 

208 This really should exclude all series/index with any non-string values, 

209 but that isn't practical for performance reasons until we have a str 

210 dtype (GH 9343 / 13877) 

211 

212 Parameters 

213 ---------- 

214 data : The content of the Series 

215 

216 Returns 

217 ------- 

218 dtype : inferred dtype of data 

219 """ 

220 if isinstance(data, ABCMultiIndex): 

221 raise AttributeError( 

222 "Can only use .str accessor with Index, not MultiIndex" 

223 ) 

224 

225 # see _libs/lib.pyx for list of inferred types 

226 allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] 

227 

228 data = extract_array(data) 

229 

230 values = getattr(data, "categories", data) # categorical / normal 

231 

232 inferred_dtype = lib.infer_dtype(values, skipna=True) 

233 

234 if inferred_dtype not in allowed_types: 

235 raise AttributeError("Can only use .str accessor with string values!") 

236 return inferred_dtype 

237 

238 def __getitem__(self, key): 

239 result = self._data.array._str_getitem(key) 

240 return self._wrap_result(result) 

241 

    def _wrap_result(
        self,
        result,
        name=None,
        expand: bool | None = None,
        fill_value=np.nan,
        returns_string: bool = True,
        returns_bool: bool = False,
    ):
        """
        Wrap a raw string-method result back into a Series/Index/DataFrame.

        Parameters
        ----------
        result : array-like, scalar, or DataFrame
            Raw output of the dispatched ``_str_*`` array method.
        name : hashable or list-like, optional
            Name (or column labels when expanding) for the wrapped result.
        expand : bool or None
            If None, inferred from ``result.ndim``; if True, expand into
            DataFrame/MultiIndex columns.
        fill_value : scalar, default np.nan
            NOTE(review): accepted but not referenced in this body — appears
            kept for signature compatibility with callers.
        returns_string : bool, default True
            Whether the operation returns string values (keeps the original
            string dtype for StringDtype-backed inputs).
        returns_bool : bool, default False
            NOTE(review): accepted but not referenced in this body.
        """
        from pandas import (
            Index,
            MultiIndex,
        )

        # Scalars and other non-array results pass through unchanged
        # (DataFrames still get metadata propagated via __finalize__).
        if not hasattr(result, "ndim") or not hasattr(result, "dtype"):
            if isinstance(result, ABCDataFrame):
                result = result.__finalize__(self._orig, name="str")
            return result
        assert result.ndim < 3

        # We can be wrapping a string / object / categorical result, in which
        # case we'll want to return the same dtype as the input.
        # Or we can be wrapping a numeric output, in which case we don't want
        # to return a StringArray.
        # Ideally the array method returns the right array type.
        if expand is None:
            # infer from ndim if expand is not specified
            expand = result.ndim != 1
        elif expand is True and not isinstance(self._orig, ABCIndex):
            # required when expand=True is explicitly specified
            # not needed when inferred
            if isinstance(result.dtype, ArrowDtype):
                # Arrow-backed list result: pad each list element to a common
                # length, then split into one column per position.
                import pyarrow as pa

                from pandas.compat import pa_version_under11p0

                from pandas.core.arrays.arrow.array import ArrowExtensionArray

                value_lengths = result._data.combine_chunks().value_lengths()
                max_len = pa.compute.max(value_lengths).as_py()
                min_len = pa.compute.min(value_lengths).as_py()
                if result._hasna:
                    # ArrowExtensionArray.fillna doesn't work for list scalars
                    result = ArrowExtensionArray(
                        result._data.fill_null([None] * max_len)
                    )
                if min_len < max_len:
                    # append nulls to each scalar list element up to max_len
                    if not pa_version_under11p0:
                        # list_slice with return_fixed_size_list pads with
                        # nulls in one vectorized call (pyarrow >= 11).
                        result = ArrowExtensionArray(
                            pa.compute.list_slice(
                                result._data,
                                start=0,
                                stop=max_len,
                                return_fixed_size_list=True,
                            )
                        )
                    else:
                        # Fallback for older pyarrow: pad row-by-row in numpy.
                        all_null = np.full(max_len, fill_value=None, dtype=object)
                        values = result.to_numpy()
                        new_values = []
                        for row in values:
                            if len(row) < max_len:
                                nulls = all_null[: max_len - len(row)]
                                row = np.append(row, nulls)
                            new_values.append(row)
                        pa_type = result._data.type
                        result = ArrowExtensionArray(pa.array(new_values, type=pa_type))
                if name is not None:
                    labels = name
                else:
                    labels = range(max_len)
                # Transpose rows of lists into one ArrowExtensionArray per
                # output column, keyed by the column label.
                result = {
                    label: ArrowExtensionArray(pa.array(res))
                    for label, res in zip(labels, (zip(*result.tolist())))
                }
            elif is_object_dtype(result):

                def cons_row(x):
                    # Ensure every row is list-like so rows can be expanded.
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                result = [cons_row(x) for x in result]
                if result and not self._is_string:
                    # propagate nan values to match longest sequence (GH 18450)
                    max_len = max(len(x) for x in result)
                    result = [
                        x * max_len if len(x) == 0 or x[0] is np.nan else x
                        for x in result
                    ]

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        if expand is False:
            # if expand is False, result should have the same name
            # as the original otherwise specified
            if name is None:
                name = getattr(result, "name", None)
            if name is None:
                # do not use logical or, _orig may be a DataFrame
                # which has "name" column
                name = self._orig.name

        # Wait until we are sure result is a Series or Index before
        # checking attributes (GH 12180)
        if isinstance(self._orig, ABCIndex):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result

            if expand:
                result = list(result)
                out = MultiIndex.from_tuples(result, names=name)
                if out.nlevels == 1:
                    # We had all tuples of length-one, which are
                    # better represented as a regular Index.
                    out = out.get_level_values(0)
                return out
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            # This is a mess.
            dtype: DtypeObj | str | None
            vdtype = getattr(result, "dtype", None)
            if self._is_string:
                if is_bool_dtype(vdtype):
                    # boolean results keep their own (masked) dtype
                    dtype = result.dtype
                elif returns_string:
                    # preserve the original StringDtype for string output
                    dtype = self._orig.dtype
                else:
                    dtype = vdtype
            else:
                dtype = vdtype

            if expand:
                cons = self._orig._constructor_expanddim
                result = cons(result, columns=name, index=index, dtype=dtype)
            else:
                # Must be a Series
                cons = self._orig._constructor
                result = cons(result, name=name, index=index, dtype=dtype)
            result = result.__finalize__(self._orig, method="str")
            if name is not None and result.ndim == 1:
                # __finalize__ might copy over the original name, but we may
                # want the new name (e.g. str.extract).
                result.name = name
            return result

394 

395 def _get_series_list(self, others): 

396 """ 

397 Auxiliary function for :meth:`str.cat`. Turn potentially mixed input 

398 into a list of Series (elements without an index must match the length 

399 of the calling Series/Index). 

400 

401 Parameters 

402 ---------- 

403 others : Series, DataFrame, np.ndarray, list-like or list-like of 

404 Objects that are either Series, Index or np.ndarray (1-dim). 

405 

406 Returns 

407 ------- 

408 list of Series 

409 Others transformed into list of Series. 

410 """ 

411 from pandas import ( 

412 DataFrame, 

413 Series, 

414 ) 

415 

416 # self._orig is either Series or Index 

417 idx = self._orig if isinstance(self._orig, ABCIndex) else self._orig.index 

418 

419 # Generally speaking, all objects without an index inherit the index 

420 # `idx` of the calling Series/Index - i.e. must have matching length. 

421 # Objects with an index (i.e. Series/Index/DataFrame) keep their own. 

422 if isinstance(others, ABCSeries): 

423 return [others] 

424 elif isinstance(others, ABCIndex): 

425 return [Series(others, index=idx, dtype=others.dtype)] 

426 elif isinstance(others, ABCDataFrame): 

427 return [others[x] for x in others] 

428 elif isinstance(others, np.ndarray) and others.ndim == 2: 

429 others = DataFrame(others, index=idx) 

430 return [others[x] for x in others] 

431 elif is_list_like(others, allow_sets=False): 

432 others = list(others) # ensure iterators do not get read twice etc 

433 

434 # in case of list-like `others`, all elements must be 

435 # either Series/Index/np.ndarray (1-dim)... 

436 if all( 

437 isinstance(x, (ABCSeries, ABCIndex)) 

438 or (isinstance(x, np.ndarray) and x.ndim == 1) 

439 for x in others 

440 ): 

441 los: list[Series] = [] 

442 while others: # iterate through list and append each element 

443 los = los + self._get_series_list(others.pop(0)) 

444 return los 

445 # ... or just strings 

446 elif all(not is_list_like(x) for x in others): 

447 return [Series(others, index=idx)] 

448 raise TypeError( 

449 "others must be Series, Index, DataFrame, np.ndarray " 

450 "or list-like (either containing only strings or " 

451 "containing only objects of type Series/Index/" 

452 "np.ndarray[1-dim])" 

453 ) 

454 

    @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"])
    def cat(
        self,
        others=None,
        sep=None,
        na_rep=None,
        join: AlignJoin = "left",
    ) -> str | Series | Index:
        """
        Concatenate strings in the Series/Index with given separator.

        If `others` is specified, this function concatenates the Series/Index
        and elements of `others` element-wise.
        If `others` is not passed, then all values in the Series/Index are
        concatenated into a single string with a given `sep`.

        Parameters
        ----------
        others : Series, Index, DataFrame, np.ndarray or list-like
            Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and
            other list-likes of strings must have the same length as the
            calling Series/Index, with the exception of indexed objects (i.e.
            Series/Index/DataFrame) if `join` is not None.

            If others is a list-like that contains a combination of Series,
            Index or np.ndarray (1-dim), then all elements will be unpacked and
            must satisfy the above criteria individually.

            If others is None, the method returns the concatenation of all
            strings in the calling Series/Index.
        sep : str, default ''
            The separator between the different elements/columns. By default
            the empty string `''` is used.
        na_rep : str or None, default None
            Representation that is inserted for all missing values:

            - If `na_rep` is None, and `others` is None, missing values in the
              Series/Index are omitted from the result.
            - If `na_rep` is None, and `others` is not None, a row containing a
              missing value in any of the columns (before concatenation) will
              have a missing value in the result.
        join : {'left', 'right', 'outer', 'inner'}, default 'left'
            Determines the join-style between the calling Series/Index and any
            Series/Index/DataFrame in `others` (objects without an index need
            to match the length of the calling Series/Index). To disable
            alignment, use `.values` on any Series/Index/DataFrame in `others`.

        Returns
        -------
        str, Series or Index
            If `others` is None, `str` is returned, otherwise a `Series/Index`
            (same type as caller) of objects is returned.

        See Also
        --------
        split : Split each string in the Series/Index.
        join : Join lists contained as elements in the Series/Index.

        Examples
        --------
        When not passing `others`, all values are concatenated into a single
        string:

        >>> s = pd.Series(['a', 'b', np.nan, 'd'])
        >>> s.str.cat(sep=' ')
        'a b d'

        By default, NA values in the Series are ignored. Using `na_rep`, they
        can be given a representation:

        >>> s.str.cat(sep=' ', na_rep='?')
        'a b ? d'

        If `others` is specified, corresponding values are concatenated with
        the separator. Result will be a Series of strings.

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',')
        0    a,A
        1    b,B
        2    NaN
        3    d,D
        dtype: object

        Missing values will remain missing in the result, but can again be
        represented using `na_rep`

        >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-')
        0    a,A
        1    b,B
        2    -,C
        3    d,D
        dtype: object

        If `sep` is not specified, the values are concatenated without
        separation.

        >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-')
        0    aA
        1    bB
        2    -C
        3    dD
        dtype: object

        Series with different indexes can be aligned before concatenation. The
        `join`-keyword works as in other methods.

        >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2])
        >>> s.str.cat(t, join='left', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='outer', na_rep='-')
        0    aa
        1    b-
        2    -c
        3    dd
        4    -e
        dtype: object
        >>>
        >>> s.str.cat(t, join='inner', na_rep='-')
        0    aa
        2    -c
        3    dd
        dtype: object
        >>>
        >>> s.str.cat(t, join='right', na_rep='-')
        3    dd
        0    aa
        4    -e
        2    -c
        dtype: object

        For more examples, see :ref:`here <text.concatenate>`.
        """
        # TODO: dispatch
        from pandas import (
            Index,
            Series,
            concat,
        )

        # A bare string for `others` is almost certainly a mistaken `sep`.
        if isinstance(others, str):
            raise ValueError("Did you mean to supply a `sep` keyword?")
        if sep is None:
            sep = ""

        if isinstance(self._orig, ABCIndex):
            # Work on a Series view so alignment/indexing below is uniform.
            data = Series(self._orig, index=self._orig, dtype=self._orig.dtype)
        else:  # Series
            data = self._orig

        # concatenate Series/Index with itself if no "others"
        if others is None:
            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Series")
            data = ensure_object(data)  # type: ignore[assignment]
            na_mask = isna(data)
            if na_rep is None and na_mask.any():
                # drop NAs entirely when no replacement was requested
                return sep.join(data[~na_mask])
            elif na_rep is not None and na_mask.any():
                return sep.join(np.where(na_mask, na_rep, data))
            else:
                return sep.join(data)

        try:
            # turn anything in "others" into lists of Series
            others = self._get_series_list(others)
        except ValueError as err:  # do not catch TypeError raised by _get_series_list
            raise ValueError(
                "If `others` contains arrays or lists (or other "
                "list-likes without an index), these must all be "
                "of the same length as the calling Series/Index."
            ) from err

        # align if required
        if any(not data.index.equals(x.index) for x in others):
            # Need to add keys for uniqueness in case of duplicate columns
            others = concat(
                others,
                axis=1,
                join=(join if join == "inner" else "outer"),
                keys=range(len(others)),
                sort=False,
                copy=False,
            )
            data, others = data.align(others, join=join)
            others = [others[x] for x in others]  # again list of Series

        # Row is NA if any participating column is NA at that position.
        all_cols = [ensure_object(x) for x in [data] + others]
        na_masks = np.array([isna(x) for x in all_cols])
        union_mask = np.logical_or.reduce(na_masks, axis=0)

        if na_rep is None and union_mask.any():
            # no na_rep means NaNs for all rows where any column has a NaN
            # only necessary if there are actually any NaNs
            result = np.empty(len(data), dtype=object)
            np.putmask(result, union_mask, np.nan)

            not_masked = ~union_mask
            result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep)
        elif na_rep is not None and union_mask.any():
            # fill NaNs with na_rep in case there are actually any NaNs
            all_cols = [
                np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols)
            ]
            result = cat_safe(all_cols, sep)
        else:
            # no NaNs - can just concatenate
            result = cat_safe(all_cols, sep)

        out: Index | Series
        if isinstance(self._orig, ABCIndex):
            # add dtype for case that result is all-NA

            out = Index(result, dtype=object, name=self._orig.name)
        else:  # Series
            if is_categorical_dtype(self._orig.dtype):
                # We need to infer the new categories.
                dtype = None
            else:
                dtype = self._orig.dtype
            res_ser = Series(
                result, dtype=dtype, index=data.index, name=self._orig.name, copy=False
            )
            out = res_ser.__finalize__(self._orig, method="str_cat")
        return out

684 

    # Template docstring shared by split/rsplit; the %-placeholders are
    # filled in per method via @Appender on each definition.
    _shared_docs[
        "str_split"
    ] = r"""
    Split strings around given separator/delimiter.

    Splits the string in the Series/Index from the %(side)s,
    at the specified delimiter string.

    Parameters
    ----------
    pat : str%(pat_regex)s, optional
        %(pat_description)s.
        If not specified, split on whitespace.
    n : int, default -1 (all)
        Limit number of splits in output.
        ``None``, 0 and -1 will be interpreted as return all splits.
    expand : bool, default False
        Expand the split strings into separate columns.

        - If ``True``, return DataFrame/MultiIndex expanding dimensionality.
        - If ``False``, return Series/Index, containing lists of strings.
    %(regex_argument)s
    Returns
    -------
    Series, Index, DataFrame or MultiIndex
        Type matches caller unless ``expand=True`` (see Notes).
    %(raises_split)s
    See Also
    --------
    Series.str.split : Split strings around given separator/delimiter.
    Series.str.rsplit : Splits string around given separator/delimiter,
        starting from the right.
    Series.str.join : Join lists contained as elements in the Series/Index
        with passed delimiter.
    str.split : Standard library version for split.
    str.rsplit : Standard library version for rsplit.

    Notes
    -----
    The handling of the `n` keyword depends on the number of found splits:

    - If found splits > `n`, make first `n` splits only
    - If found splits <= `n`, make all splits
    - If for a certain row the number of found splits < `n`,
      append `None` for padding up to `n` if ``expand=True``

    If using ``expand=True``, Series and Index callers return DataFrame and
    MultiIndex objects, respectively.
    %(regex_pat_note)s
    Examples
    --------
    >>> s = pd.Series(
    ...     [
    ...         "this is a regular sentence",
    ...         "https://docs.python.org/3/tutorial/index.html",
    ...         np.nan
    ...     ]
    ... )
    >>> s
    0                       this is a regular sentence
    1    https://docs.python.org/3/tutorial/index.html
    2                                              NaN
    dtype: object

    In the default setting, the string is split by whitespace.

    >>> s.str.split()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    Without the `n` parameter, the outputs of `rsplit` and `split`
    are identical.

    >>> s.str.rsplit()
    0                   [this, is, a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `n` parameter can be used to limit the number of splits on the
    delimiter. The outputs of `split` and `rsplit` are different.

    >>> s.str.split(n=2)
    0                     [this, is, a regular sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    >>> s.str.rsplit(n=2)
    0                     [this is a, regular, sentence]
    1    [https://docs.python.org/3/tutorial/index.html]
    2                                                NaN
    dtype: object

    The `pat` parameter can be used to split by other characters.

    >>> s.str.split(pat="/")
    0                         [this is a regular sentence]
    1    [https:, , docs.python.org, 3, tutorial, index...
    2                                                  NaN
    dtype: object

    When using ``expand=True``, the split elements will expand out into
    separate columns. If NaN is present, it is propagated throughout
    the columns during the split.

    >>> s.str.split(expand=True)
                                                   0     1     2        3         4
    0                                           this    is     a  regular  sentence
    1  https://docs.python.org/3/tutorial/index.html  None  None     None      None
    2                                            NaN   NaN   NaN      NaN       NaN

    For slightly more complex use cases like splitting the html document name
    from a url, a combination of parameter settings can be used.

    >>> s.str.rsplit("/", n=1, expand=True)
                                        0           1
    0          this is a regular sentence        None
    1  https://docs.python.org/3/tutorial  index.html
    2                                 NaN         NaN
    %(regex_examples)s"""

808 

809 @Appender( 

810 _shared_docs["str_split"] 

811 % { 

812 "side": "beginning", 

813 "pat_regex": " or compiled regex", 

814 "pat_description": "String or regular expression to split on", 

815 "regex_argument": """ 

816 regex : bool, default None 

817 Determines if the passed-in pattern is a regular expression: 

818 

819 - If ``True``, assumes the passed-in pattern is a regular expression 

820 - If ``False``, treats the pattern as a literal string. 

821 - If ``None`` and `pat` length is 1, treats `pat` as a literal string. 

822 - If ``None`` and `pat` length is not 1, treats `pat` as a regular expression. 

823 - Cannot be set to False if `pat` is a compiled regex 

824 

825 .. versionadded:: 1.4.0 

826 """, 

827 "raises_split": """ 

828 Raises 

829 ------ 

830 ValueError 

831 * if `regex` is False and `pat` is a compiled regex 

832 """, 

833 "regex_pat_note": """ 

834 Use of `regex =False` with a `pat` as a compiled regex will raise an error. 

835 """, 

836 "method": "split", 

837 "regex_examples": r""" 

838 Remember to escape special characters when explicitly using regular expressions. 

839 

840 >>> s = pd.Series(["foo and bar plus baz"]) 

841 >>> s.str.split(r"and|plus", expand=True) 

842 0 1 2 

843 0 foo bar baz 

844 

845 Regular expressions can be used to handle urls or file names. 

846 When `pat` is a string and ``regex=None`` (the default), the given `pat` is compiled 

847 as a regex only if ``len(pat) != 1``. 

848 

849 >>> s = pd.Series(['foojpgbar.jpg']) 

850 >>> s.str.split(r".", expand=True) 

851 0 1 

852 0 foojpgbar jpg 

853 

854 >>> s.str.split(r"\.jpg", expand=True) 

855 0 1 

856 0 foojpgbar 

857 

858 When ``regex=True``, `pat` is interpreted as a regex 

859 

860 >>> s.str.split(r"\.jpg", regex=True, expand=True) 

861 0 1 

862 0 foojpgbar 

863 

864 A compiled regex can be passed as `pat` 

865 

866 >>> import re 

867 >>> s.str.split(re.compile(r"\.jpg"), expand=True) 

868 0 1 

869 0 foojpgbar 

870 

871 When ``regex=False``, `pat` is interpreted as the string itself 

872 

873 >>> s.str.split(r"\.jpg", regex=False, expand=True) 

874 0 

875 0 foojpgbar.jpg 

876 """, 

877 } 

878 ) 

879 @forbid_nonstring_types(["bytes"]) 

880 def split( 

881 self, 

882 pat: str | re.Pattern | None = None, 

883 *, 

884 n=-1, 

885 expand: bool = False, 

886 regex: bool | None = None, 

887 ): 

888 if regex is False and is_re(pat): 

889 raise ValueError( 

890 "Cannot use a compiled regex as replacement pattern with regex=False" 

891 ) 

892 if is_re(pat): 

893 regex = True 

894 result = self._data.array._str_split(pat, n, expand, regex) 

895 return self._wrap_result(result, returns_string=expand, expand=expand) 

896 

897 @Appender( 

898 _shared_docs["str_split"] 

899 % { 

900 "side": "end", 

901 "pat_regex": "", 

902 "pat_description": "String to split on", 

903 "regex_argument": "", 

904 "raises_split": "", 

905 "regex_pat_note": "", 

906 "method": "rsplit", 

907 "regex_examples": "", 

908 } 

909 ) 

910 @forbid_nonstring_types(["bytes"]) 

911 def rsplit(self, pat=None, *, n=-1, expand: bool = False): 

912 result = self._data.array._str_rsplit(pat, n=n) 

913 return self._wrap_result(result, expand=expand, returns_string=expand) 

914 

    # Template docstring shared by partition/rpartition; %-placeholders are
    # filled in per method via @Appender on each definition.
    _shared_docs[
        "str_partition"
    ] = """
    Split the string at the %(side)s occurrence of `sep`.

    This method splits the string at the %(side)s occurrence of `sep`,
    and returns 3 elements containing the part before the separator,
    the separator itself, and the part after the separator.
    If the separator is not found, return %(return)s.

    Parameters
    ----------
    sep : str, default whitespace
        String to split on.
    expand : bool, default True
        If True, return DataFrame/MultiIndex expanding dimensionality.
        If False, return Series/Index.

    Returns
    -------
    DataFrame/MultiIndex or Series/Index of objects

    See Also
    --------
    %(also)s
    Series.str.split : Split strings around given separators.
    str.partition : Standard library version.

    Examples
    --------

    >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
    >>> s
    0    Linda van der Berg
    1    George Pitt-Rivers
    dtype: object

    >>> s.str.partition()
            0  1             2
    0   Linda     van der Berg
    1  George      Pitt-Rivers

    To partition by the last space instead of the first one:

    >>> s.str.rpartition()
                    0  1            2
    0  Linda van der            Berg
    1    George Pitt      -Rivers

    To partition by something different than a space:

    >>> s.str.partition('-')
                        0  1       2
    0  Linda van der Berg
    1         George Pitt  -  Rivers

    To return a Series containing tuples instead of a DataFrame:

    >>> s.str.partition('-', expand=False)
    0    (Linda van der Berg, , )
    1    (George Pitt, -, Rivers)
    dtype: object

    Also available on indices:

    >>> idx = pd.Index(['X 123', 'Y 999'])
    >>> idx
    Index(['X 123', 'Y 999'], dtype='object')

    Which will create a MultiIndex:

    >>> idx.str.partition()
    MultiIndex([('X', ' ', '123'),
                ('Y', ' ', '999')],
               )

    Or an index with tuples with ``expand=False``:

    >>> idx.str.partition(expand=False)
    Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object')
    """

996 

997 @Appender( 

998 _shared_docs["str_partition"] 

999 % { 

1000 "side": "first", 

1001 "return": "3 elements containing the string itself, followed by two " 

1002 "empty strings", 

1003 "also": "rpartition : Split the string at the last occurrence of `sep`.", 

1004 } 

1005 ) 

1006 @forbid_nonstring_types(["bytes"]) 

1007 def partition(self, sep: str = " ", expand: bool = True): 

1008 result = self._data.array._str_partition(sep, expand) 

1009 return self._wrap_result(result, expand=expand, returns_string=expand) 

1010 

1011 @Appender( 

1012 _shared_docs["str_partition"] 

1013 % { 

1014 "side": "last", 

1015 "return": "3 elements containing two empty strings, followed by the " 

1016 "string itself", 

1017 "also": "partition : Split the string at the first occurrence of `sep`.", 

1018 } 

1019 ) 

1020 @forbid_nonstring_types(["bytes"]) 

1021 def rpartition(self, sep: str = " ", expand: bool = True): 

1022 result = self._data.array._str_rpartition(sep, expand) 

1023 return self._wrap_result(result, expand=expand, returns_string=expand) 

1024 

1025 def get(self, i): 

1026 """ 

1027 Extract element from each component at specified position or with specified key. 

1028 

1029 Extract element from lists, tuples, dict, or strings in each element in the 

1030 Series/Index. 

1031 

1032 Parameters 

1033 ---------- 

1034 i : int or hashable dict label 

1035 Position or key of element to extract. 

1036 

1037 Returns 

1038 ------- 

1039 Series or Index 

1040 

1041 Examples 

1042 -------- 

1043 >>> s = pd.Series(["String", 

1044 ... (1, 2, 3), 

1045 ... ["a", "b", "c"], 

1046 ... 123, 

1047 ... -456, 

1048 ... {1: "Hello", "2": "World"}]) 

1049 >>> s 

1050 0 String 

1051 1 (1, 2, 3) 

1052 2 [a, b, c] 

1053 3 123 

1054 4 -456 

1055 5 {1: 'Hello', '2': 'World'} 

1056 dtype: object 

1057 

1058 >>> s.str.get(1) 

1059 0 t 

1060 1 2 

1061 2 b 

1062 3 NaN 

1063 4 NaN 

1064 5 Hello 

1065 dtype: object 

1066 

1067 >>> s.str.get(-1) 

1068 0 g 

1069 1 3 

1070 2 c 

1071 3 NaN 

1072 4 NaN 

1073 5 None 

1074 dtype: object 

1075 

1076 Return element with given key 

1077 

1078 >>> s = pd.Series([{"name": "Hello", "value": "World"}, 

1079 ... {"name": "Goodbye", "value": "Planet"}]) 

1080 >>> s.str.get('name') 

1081 0 Hello 

1082 1 Goodbye 

1083 dtype: object 

1084 """ 

1085 result = self._data.array._str_get(i) 

1086 return self._wrap_result(result) 

1087 

1088 @forbid_nonstring_types(["bytes"]) 

1089 def join(self, sep): 

1090 """ 

1091 Join lists contained as elements in the Series/Index with passed delimiter. 

1092 

1093 If the elements of a Series are lists themselves, join the content of these 

1094 lists using the delimiter passed to the function. 

1095 This function is an equivalent to :meth:`str.join`. 

1096 

1097 Parameters 

1098 ---------- 

1099 sep : str 

1100 Delimiter to use between list entries. 

1101 

1102 Returns 

1103 ------- 

1104 Series/Index: object 

1105 The list entries concatenated by intervening occurrences of the 

1106 delimiter. 

1107 

1108 Raises 

1109 ------ 

1110 AttributeError 

1111 If the supplied Series contains neither strings nor lists. 

1112 

1113 See Also 

1114 -------- 

1115 str.join : Standard library version of this method. 

1116 Series.str.split : Split strings around given separator/delimiter. 

1117 

1118 Notes 

1119 ----- 

1120 If any of the list items is not a string object, the result of the join 

1121 will be `NaN`. 

1122 

1123 Examples 

1124 -------- 

1125 Example with a list that contains non-string elements. 

1126 

1127 >>> s = pd.Series([['lion', 'elephant', 'zebra'], 

1128 ... [1.1, 2.2, 3.3], 

1129 ... ['cat', np.nan, 'dog'], 

1130 ... ['cow', 4.5, 'goat'], 

1131 ... ['duck', ['swan', 'fish'], 'guppy']]) 

1132 >>> s 

1133 0 [lion, elephant, zebra] 

1134 1 [1.1, 2.2, 3.3] 

1135 2 [cat, nan, dog] 

1136 3 [cow, 4.5, goat] 

1137 4 [duck, [swan, fish], guppy] 

1138 dtype: object 

1139 

1140 Join all lists using a '-'. The lists containing object(s) of types other 

1141 than str will produce a NaN. 

1142 

1143 >>> s.str.join('-') 

1144 0 lion-elephant-zebra 

1145 1 NaN 

1146 2 NaN 

1147 3 NaN 

1148 4 NaN 

1149 dtype: object 

1150 """ 

1151 result = self._data.array._str_join(sep) 

1152 return self._wrap_result(result) 

1153 

1154 @forbid_nonstring_types(["bytes"]) 

1155 def contains( 

1156 self, pat, case: bool = True, flags: int = 0, na=None, regex: bool = True 

1157 ): 

1158 r""" 

1159 Test if pattern or regex is contained within a string of a Series or Index. 

1160 

1161 Return boolean Series or Index based on whether a given pattern or regex is 

1162 contained within a string of a Series or Index. 

1163 

1164 Parameters 

1165 ---------- 

1166 pat : str 

1167 Character sequence or regular expression. 

1168 case : bool, default True 

1169 If True, case sensitive. 

1170 flags : int, default 0 (no flags) 

1171 Flags to pass through to the re module, e.g. re.IGNORECASE. 

1172 na : scalar, optional 

1173 Fill value for missing values. The default depends on dtype of the 

1174 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1175 ``pandas.NA`` is used. 

1176 regex : bool, default True 

1177 If True, assumes the pat is a regular expression. 

1178 

1179 If False, treats the pat as a literal string. 

1180 

1181 Returns 

1182 ------- 

1183 Series or Index of boolean values 

1184 A Series or Index of boolean values indicating whether the 

1185 given pattern is contained within the string of each element 

1186 of the Series or Index. 

1187 

1188 See Also 

1189 -------- 

1190 match : Analogous, but stricter, relying on re.match instead of re.search. 

1191 Series.str.startswith : Test if the start of each string element matches a 

1192 pattern. 

1193 Series.str.endswith : Same as startswith, but tests the end of string. 

1194 

1195 Examples 

1196 -------- 

1197 Returning a Series of booleans using only a literal pattern. 

1198 

1199 >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) 

1200 >>> s1.str.contains('og', regex=False) 

1201 0 False 

1202 1 True 

1203 2 False 

1204 3 False 

1205 4 NaN 

1206 dtype: object 

1207 

1208 Returning an Index of booleans using only a literal pattern. 

1209 

1210 >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) 

1211 >>> ind.str.contains('23', regex=False) 

1212 Index([False, False, False, True, nan], dtype='object') 

1213 

1214 Specifying case sensitivity using `case`. 

1215 

1216 >>> s1.str.contains('oG', case=True, regex=True) 

1217 0 False 

1218 1 False 

1219 2 False 

1220 3 False 

1221 4 NaN 

1222 dtype: object 

1223 

1224 Specifying `na` to be `False` instead of `NaN` replaces NaN values 

1225 with `False`. If Series or Index does not contain NaN values 

1226 the resultant dtype will be `bool`, otherwise, an `object` dtype. 

1227 

1228 >>> s1.str.contains('og', na=False, regex=True) 

1229 0 False 

1230 1 True 

1231 2 False 

1232 3 False 

1233 4 False 

1234 dtype: bool 

1235 

1236 Returning 'house' or 'dog' when either expression occurs in a string. 

1237 

1238 >>> s1.str.contains('house|dog', regex=True) 

1239 0 False 

1240 1 True 

1241 2 True 

1242 3 False 

1243 4 NaN 

1244 dtype: object 

1245 

1246 Ignoring case sensitivity using `flags` with regex. 

1247 

1248 >>> import re 

1249 >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) 

1250 0 False 

1251 1 False 

1252 2 True 

1253 3 False 

1254 4 NaN 

1255 dtype: object 

1256 

1257 Returning any digit using regular expression. 

1258 

1259 >>> s1.str.contains('\\d', regex=True) 

1260 0 False 

1261 1 False 

1262 2 False 

1263 3 True 

1264 4 NaN 

1265 dtype: object 

1266 

1267 Ensure `pat` is a not a literal pattern when `regex` is set to True. 

1268 Note in the following example one might expect only `s2[1]` and `s2[3]` to 

1269 return `True`. However, '.0' as a regex matches any character 

1270 followed by a 0. 

1271 

1272 >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) 

1273 >>> s2.str.contains('.0', regex=True) 

1274 0 True 

1275 1 True 

1276 2 False 

1277 3 True 

1278 4 False 

1279 dtype: bool 

1280 """ 

1281 if regex and re.compile(pat).groups: 

1282 warnings.warn( 

1283 "This pattern is interpreted as a regular expression, and has " 

1284 "match groups. To actually get the groups, use str.extract.", 

1285 UserWarning, 

1286 stacklevel=find_stack_level(), 

1287 ) 

1288 

1289 result = self._data.array._str_contains(pat, case, flags, na, regex) 

1290 return self._wrap_result(result, fill_value=na, returns_string=False) 

1291 

1292 @forbid_nonstring_types(["bytes"]) 

1293 def match(self, pat, case: bool = True, flags: int = 0, na=None): 

1294 """ 

1295 Determine if each string starts with a match of a regular expression. 

1296 

1297 Parameters 

1298 ---------- 

1299 pat : str 

1300 Character sequence or regular expression. 

1301 case : bool, default True 

1302 If True, case sensitive. 

1303 flags : int, default 0 (no flags) 

1304 Regex module flags, e.g. re.IGNORECASE. 

1305 na : scalar, optional 

1306 Fill value for missing values. The default depends on dtype of the 

1307 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1308 ``pandas.NA`` is used. 

1309 

1310 Returns 

1311 ------- 

1312 Series/Index/array of boolean values 

1313 

1314 See Also 

1315 -------- 

1316 fullmatch : Stricter matching that requires the entire string to match. 

1317 contains : Analogous, but less strict, relying on re.search instead of 

1318 re.match. 

1319 extract : Extract matched groups. 

1320 """ 

1321 result = self._data.array._str_match(pat, case=case, flags=flags, na=na) 

1322 return self._wrap_result(result, fill_value=na, returns_string=False) 

1323 

1324 @forbid_nonstring_types(["bytes"]) 

1325 def fullmatch(self, pat, case: bool = True, flags: int = 0, na=None): 

1326 """ 

1327 Determine if each string entirely matches a regular expression. 

1328 

1329 .. versionadded:: 1.1.0 

1330 

1331 Parameters 

1332 ---------- 

1333 pat : str 

1334 Character sequence or regular expression. 

1335 case : bool, default True 

1336 If True, case sensitive. 

1337 flags : int, default 0 (no flags) 

1338 Regex module flags, e.g. re.IGNORECASE. 

1339 na : scalar, optional 

1340 Fill value for missing values. The default depends on dtype of the 

1341 array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, 

1342 ``pandas.NA`` is used. 

1343 

1344 Returns 

1345 ------- 

1346 Series/Index/array of boolean values 

1347 

1348 See Also 

1349 -------- 

1350 match : Similar, but also returns `True` when only a *prefix* of the string 

1351 matches the regular expression. 

1352 extract : Extract matched groups. 

1353 """ 

1354 result = self._data.array._str_fullmatch(pat, case=case, flags=flags, na=na) 

1355 return self._wrap_result(result, fill_value=na, returns_string=False) 

1356 

1357 @forbid_nonstring_types(["bytes"]) 

1358 def replace( 

1359 self, 

1360 pat: str | re.Pattern, 

1361 repl: str | Callable, 

1362 n: int = -1, 

1363 case: bool | None = None, 

1364 flags: int = 0, 

1365 regex: bool = False, 

1366 ): 

1367 r""" 

1368 Replace each occurrence of pattern/regex in the Series/Index. 

1369 

1370 Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on 

1371 the regex value. 

1372 

1373 Parameters 

1374 ---------- 

1375 pat : str or compiled regex 

1376 String can be a character sequence or regular expression. 

1377 repl : str or callable 

1378 Replacement string or a callable. The callable is passed the regex 

1379 match object and must return a replacement string to be used. 

1380 See :func:`re.sub`. 

1381 n : int, default -1 (all) 

1382 Number of replacements to make from start. 

1383 case : bool, default None 

1384 Determines if replace is case sensitive: 

1385 

1386 - If True, case sensitive (the default if `pat` is a string) 

1387 - Set to False for case insensitive 

1388 - Cannot be set if `pat` is a compiled regex. 

1389 

1390 flags : int, default 0 (no flags) 

1391 Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled 

1392 regex. 

1393 regex : bool, default False 

1394 Determines if the passed-in pattern is a regular expression: 

1395 

1396 - If True, assumes the passed-in pattern is a regular expression. 

1397 - If False, treats the pattern as a literal string 

1398 - Cannot be set to False if `pat` is a compiled regex or `repl` is 

1399 a callable. 

1400 

1401 Returns 

1402 ------- 

1403 Series or Index of object 

1404 A copy of the object with all matching occurrences of `pat` replaced by 

1405 `repl`. 

1406 

1407 Raises 

1408 ------ 

1409 ValueError 

1410 * if `regex` is False and `repl` is a callable or `pat` is a compiled 

1411 regex 

1412 * if `pat` is a compiled regex and `case` or `flags` is set 

1413 

1414 Notes 

1415 ----- 

1416 When `pat` is a compiled regex, all flags should be included in the 

1417 compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled 

1418 regex will raise an error. 

1419 

1420 Examples 

1421 -------- 

1422 When `pat` is a string and `regex` is True (the default), the given `pat` 

1423 is compiled as a regex. When `repl` is a string, it replaces matching 

1424 regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are 

1425 left as is: 

1426 

1427 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) 

1428 0 bao 

1429 1 baz 

1430 2 NaN 

1431 dtype: object 

1432 

1433 When `pat` is a string and `regex` is False, every `pat` is replaced with 

1434 `repl` as with :meth:`str.replace`: 

1435 

1436 >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) 

1437 0 bao 

1438 1 fuz 

1439 2 NaN 

1440 dtype: object 

1441 

1442 When `repl` is a callable, it is called on every `pat` using 

1443 :func:`re.sub`. The callable should expect one positional argument 

1444 (a regex object) and return a string. 

1445 

1446 To get the idea: 

1447 

1448 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr, regex=True) 

1449 0 <re.Match object; span=(0, 1), match='f'>oo 

1450 1 <re.Match object; span=(0, 1), match='f'>uz 

1451 2 NaN 

1452 dtype: object 

1453 

1454 Reverse every lowercase alphabetic word: 

1455 

1456 >>> repl = lambda m: m.group(0)[::-1] 

1457 >>> ser = pd.Series(['foo 123', 'bar baz', np.nan]) 

1458 >>> ser.str.replace(r'[a-z]+', repl, regex=True) 

1459 0 oof 123 

1460 1 rab zab 

1461 2 NaN 

1462 dtype: object 

1463 

1464 Using regex groups (extract second group and swap case): 

1465 

1466 >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" 

1467 >>> repl = lambda m: m.group('two').swapcase() 

1468 >>> ser = pd.Series(['One Two Three', 'Foo Bar Baz']) 

1469 >>> ser.str.replace(pat, repl, regex=True) 

1470 0 tWO 

1471 1 bAR 

1472 dtype: object 

1473 

1474 Using a compiled regex with flags 

1475 

1476 >>> import re 

1477 >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) 

1478 >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar', regex=True) 

1479 0 foo 

1480 1 bar 

1481 2 NaN 

1482 dtype: object 

1483 """ 

1484 # Check whether repl is valid (GH 13438, GH 15055) 

1485 if not (isinstance(repl, str) or callable(repl)): 

1486 raise TypeError("repl must be a string or callable") 

1487 

1488 is_compiled_re = is_re(pat) 

1489 if regex or regex is None: 

1490 if is_compiled_re and (case is not None or flags != 0): 

1491 raise ValueError( 

1492 "case and flags cannot be set when pat is a compiled regex" 

1493 ) 

1494 

1495 elif is_compiled_re: 

1496 raise ValueError( 

1497 "Cannot use a compiled regex as replacement pattern with regex=False" 

1498 ) 

1499 elif callable(repl): 

1500 raise ValueError("Cannot use a callable replacement when regex=False") 

1501 

1502 if case is None: 

1503 case = True 

1504 

1505 result = self._data.array._str_replace( 

1506 pat, repl, n=n, case=case, flags=flags, regex=regex 

1507 ) 

1508 return self._wrap_result(result) 

1509 

1510 @forbid_nonstring_types(["bytes"]) 

1511 def repeat(self, repeats): 

1512 """ 

1513 Duplicate each string in the Series or Index. 

1514 

1515 Parameters 

1516 ---------- 

1517 repeats : int or sequence of int 

1518 Same value for all (int) or different value per (sequence). 

1519 

1520 Returns 

1521 ------- 

1522 Series or pandas.Index 

1523 Series or Index of repeated string objects specified by 

1524 input parameter repeats. 

1525 

1526 Examples 

1527 -------- 

1528 >>> s = pd.Series(['a', 'b', 'c']) 

1529 >>> s 

1530 0 a 

1531 1 b 

1532 2 c 

1533 dtype: object 

1534 

1535 Single int repeats string in Series 

1536 

1537 >>> s.str.repeat(repeats=2) 

1538 0 aa 

1539 1 bb 

1540 2 cc 

1541 dtype: object 

1542 

1543 Sequence of int repeats corresponding string in Series 

1544 

1545 >>> s.str.repeat(repeats=[1, 2, 3]) 

1546 0 a 

1547 1 bb 

1548 2 ccc 

1549 dtype: object 

1550 """ 

1551 result = self._data.array._str_repeat(repeats) 

1552 return self._wrap_result(result) 

1553 

1554 @forbid_nonstring_types(["bytes"]) 

1555 def pad( 

1556 self, 

1557 width, 

1558 side: Literal["left", "right", "both"] = "left", 

1559 fillchar: str = " ", 

1560 ): 

1561 """ 

1562 Pad strings in the Series/Index up to width. 

1563 

1564 Parameters 

1565 ---------- 

1566 width : int 

1567 Minimum width of resulting string; additional characters will be filled 

1568 with character defined in `fillchar`. 

1569 side : {'left', 'right', 'both'}, default 'left' 

1570 Side from which to fill resulting string. 

1571 fillchar : str, default ' ' 

1572 Additional character for filling, default is whitespace. 

1573 

1574 Returns 

1575 ------- 

1576 Series or Index of object 

1577 Returns Series or Index with minimum number of char in object. 

1578 

1579 See Also 

1580 -------- 

1581 Series.str.rjust : Fills the left side of strings with an arbitrary 

1582 character. Equivalent to ``Series.str.pad(side='left')``. 

1583 Series.str.ljust : Fills the right side of strings with an arbitrary 

1584 character. Equivalent to ``Series.str.pad(side='right')``. 

1585 Series.str.center : Fills both sides of strings with an arbitrary 

1586 character. Equivalent to ``Series.str.pad(side='both')``. 

1587 Series.str.zfill : Pad strings in the Series/Index by prepending '0' 

1588 character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. 

1589 

1590 Examples 

1591 -------- 

1592 >>> s = pd.Series(["caribou", "tiger"]) 

1593 >>> s 

1594 0 caribou 

1595 1 tiger 

1596 dtype: object 

1597 

1598 >>> s.str.pad(width=10) 

1599 0 caribou 

1600 1 tiger 

1601 dtype: object 

1602 

1603 >>> s.str.pad(width=10, side='right', fillchar='-') 

1604 0 caribou--- 

1605 1 tiger----- 

1606 dtype: object 

1607 

1608 >>> s.str.pad(width=10, side='both', fillchar='-') 

1609 0 -caribou-- 

1610 1 --tiger--- 

1611 dtype: object 

1612 """ 

1613 if not isinstance(fillchar, str): 

1614 msg = f"fillchar must be a character, not {type(fillchar).__name__}" 

1615 raise TypeError(msg) 

1616 

1617 if len(fillchar) != 1: 

1618 raise TypeError("fillchar must be a character, not str") 

1619 

1620 if not is_integer(width): 

1621 msg = f"width must be of integer type, not {type(width).__name__}" 

1622 raise TypeError(msg) 

1623 

1624 result = self._data.array._str_pad(width, side=side, fillchar=fillchar) 

1625 return self._wrap_result(result) 

1626 

    # Shared numpydoc template for center/ljust/rjust; %(side)s and %(method)s
    # are filled in by each method's @Appender decorator below.
    _shared_docs[
        "str_pad"
    ] = """
    Pad %(side)s side of strings in the Series/Index.

    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    width : int
        Minimum width of resulting string; additional characters will be filled
        with ``fillchar``.
    fillchar : str
        Additional character for filling, default is whitespace.

    Returns
    -------
    Series/Index of objects.
    """

1646 

    @Appender(_shared_docs["str_pad"] % {"side": "left and right", "method": "center"})
    @forbid_nonstring_types(["bytes"])
    def center(self, width, fillchar: str = " "):
        # Centering is padding applied to both sides.
        return self.pad(width, side="both", fillchar=fillchar)

1651 

    @Appender(_shared_docs["str_pad"] % {"side": "right", "method": "ljust"})
    @forbid_nonstring_types(["bytes"])
    def ljust(self, width, fillchar: str = " "):
        # Left-justify = pad on the right.
        return self.pad(width, side="right", fillchar=fillchar)

1656 

    @Appender(_shared_docs["str_pad"] % {"side": "left", "method": "rjust"})
    @forbid_nonstring_types(["bytes"])
    def rjust(self, width, fillchar: str = " "):
        # Right-justify = pad on the left.
        return self.pad(width, side="left", fillchar=fillchar)

1661 

1662 @forbid_nonstring_types(["bytes"]) 

1663 def zfill(self, width): 

1664 """ 

1665 Pad strings in the Series/Index by prepending '0' characters. 

1666 

1667 Strings in the Series/Index are padded with '0' characters on the 

1668 left of the string to reach a total string length `width`. Strings 

1669 in the Series/Index with length greater or equal to `width` are 

1670 unchanged. 

1671 

1672 Parameters 

1673 ---------- 

1674 width : int 

1675 Minimum length of resulting string; strings with length less 

1676 than `width` be prepended with '0' characters. 

1677 

1678 Returns 

1679 ------- 

1680 Series/Index of objects. 

1681 

1682 See Also 

1683 -------- 

1684 Series.str.rjust : Fills the left side of strings with an arbitrary 

1685 character. 

1686 Series.str.ljust : Fills the right side of strings with an arbitrary 

1687 character. 

1688 Series.str.pad : Fills the specified sides of strings with an arbitrary 

1689 character. 

1690 Series.str.center : Fills both sides of strings with an arbitrary 

1691 character. 

1692 

1693 Notes 

1694 ----- 

1695 Differs from :meth:`str.zfill` which has special handling 

1696 for '+'/'-' in the string. 

1697 

1698 Examples 

1699 -------- 

1700 >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) 

1701 >>> s 

1702 0 -1 

1703 1 1 

1704 2 1000 

1705 3 10 

1706 4 NaN 

1707 dtype: object 

1708 

1709 Note that ``10`` and ``NaN`` are not strings, therefore they are 

1710 converted to ``NaN``. The minus sign in ``'-1'`` is treated as a 

1711 special character and the zero is added to the right of it 

1712 (:meth:`str.zfill` would have moved it to the left). ``1000`` 

1713 remains unchanged as it is longer than `width`. 

1714 

1715 >>> s.str.zfill(3) 

1716 0 -01 

1717 1 001 

1718 2 1000 

1719 3 NaN 

1720 4 NaN 

1721 dtype: object 

1722 """ 

1723 if not is_integer(width): 

1724 msg = f"width must be of integer type, not {type(width).__name__}" 

1725 raise TypeError(msg) 

1726 f = lambda x: x.zfill(width) 

1727 result = self._data.array._str_map(f) 

1728 return self._wrap_result(result) 

1729 

1730 def slice(self, start=None, stop=None, step=None): 

1731 """ 

1732 Slice substrings from each element in the Series or Index. 

1733 

1734 Parameters 

1735 ---------- 

1736 start : int, optional 

1737 Start position for slice operation. 

1738 stop : int, optional 

1739 Stop position for slice operation. 

1740 step : int, optional 

1741 Step size for slice operation. 

1742 

1743 Returns 

1744 ------- 

1745 Series or Index of object 

1746 Series or Index from sliced substring from original string object. 

1747 

1748 See Also 

1749 -------- 

1750 Series.str.slice_replace : Replace a slice with a string. 

1751 Series.str.get : Return element at position. 

1752 Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` 

1753 being the position. 

1754 

1755 Examples 

1756 -------- 

1757 >>> s = pd.Series(["koala", "dog", "chameleon"]) 

1758 >>> s 

1759 0 koala 

1760 1 dog 

1761 2 chameleon 

1762 dtype: object 

1763 

1764 >>> s.str.slice(start=1) 

1765 0 oala 

1766 1 og 

1767 2 hameleon 

1768 dtype: object 

1769 

1770 >>> s.str.slice(start=-1) 

1771 0 a 

1772 1 g 

1773 2 n 

1774 dtype: object 

1775 

1776 >>> s.str.slice(stop=2) 

1777 0 ko 

1778 1 do 

1779 2 ch 

1780 dtype: object 

1781 

1782 >>> s.str.slice(step=2) 

1783 0 kaa 

1784 1 dg 

1785 2 caeen 

1786 dtype: object 

1787 

1788 >>> s.str.slice(start=0, stop=5, step=3) 

1789 0 kl 

1790 1 d 

1791 2 cm 

1792 dtype: object 

1793 

1794 Equivalent behaviour to: 

1795 

1796 >>> s.str[0:5:3] 

1797 0 kl 

1798 1 d 

1799 2 cm 

1800 dtype: object 

1801 """ 

1802 result = self._data.array._str_slice(start, stop, step) 

1803 return self._wrap_result(result) 

1804 

1805 @forbid_nonstring_types(["bytes"]) 

1806 def slice_replace(self, start=None, stop=None, repl=None): 

1807 """ 

1808 Replace a positional slice of a string with another value. 

1809 

1810 Parameters 

1811 ---------- 

1812 start : int, optional 

1813 Left index position to use for the slice. If not specified (None), 

1814 the slice is unbounded on the left, i.e. slice from the start 

1815 of the string. 

1816 stop : int, optional 

1817 Right index position to use for the slice. If not specified (None), 

1818 the slice is unbounded on the right, i.e. slice until the 

1819 end of the string. 

1820 repl : str, optional 

1821 String for replacement. If not specified (None), the sliced region 

1822 is replaced with an empty string. 

1823 

1824 Returns 

1825 ------- 

1826 Series or Index 

1827 Same type as the original object. 

1828 

1829 See Also 

1830 -------- 

1831 Series.str.slice : Just slicing without replacement. 

1832 

1833 Examples 

1834 -------- 

1835 >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) 

1836 >>> s 

1837 0 a 

1838 1 ab 

1839 2 abc 

1840 3 abdc 

1841 4 abcde 

1842 dtype: object 

1843 

1844 Specify just `start`, meaning replace `start` until the end of the 

1845 string with `repl`. 

1846 

1847 >>> s.str.slice_replace(1, repl='X') 

1848 0 aX 

1849 1 aX 

1850 2 aX 

1851 3 aX 

1852 4 aX 

1853 dtype: object 

1854 

1855 Specify just `stop`, meaning the start of the string to `stop` is replaced 

1856 with `repl`, and the rest of the string is included. 

1857 

1858 >>> s.str.slice_replace(stop=2, repl='X') 

1859 0 X 

1860 1 X 

1861 2 Xc 

1862 3 Xdc 

1863 4 Xcde 

1864 dtype: object 

1865 

1866 Specify `start` and `stop`, meaning the slice from `start` to `stop` is 

1867 replaced with `repl`. Everything before or after `start` and `stop` is 

1868 included as is. 

1869 

1870 >>> s.str.slice_replace(start=1, stop=3, repl='X') 

1871 0 aX 

1872 1 aX 

1873 2 aX 

1874 3 aXc 

1875 4 aXde 

1876 dtype: object 

1877 """ 

1878 result = self._data.array._str_slice_replace(start, stop, repl) 

1879 return self._wrap_result(result) 

1880 

1881 def decode(self, encoding, errors: str = "strict"): 

1882 """ 

1883 Decode character string in the Series/Index using indicated encoding. 

1884 

1885 Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in 

1886 python3. 

1887 

1888 Parameters 

1889 ---------- 

1890 encoding : str 

1891 errors : str, optional 

1892 

1893 Returns 

1894 ------- 

1895 Series or Index 

1896 """ 

1897 # TODO: Add a similar _bytes interface. 

1898 if encoding in _cpython_optimized_decoders: 

1899 # CPython optimized implementation 

1900 f = lambda x: x.decode(encoding, errors) 

1901 else: 

1902 decoder = codecs.getdecoder(encoding) 

1903 f = lambda x: decoder(x, errors)[0] 

1904 arr = self._data.array 

1905 # assert isinstance(arr, (StringArray,)) 

1906 result = arr._str_map(f) 

1907 return self._wrap_result(result) 

1908 

1909 @forbid_nonstring_types(["bytes"]) 

1910 def encode(self, encoding, errors: str = "strict"): 

1911 """ 

1912 Encode character string in the Series/Index using indicated encoding. 

1913 

1914 Equivalent to :meth:`str.encode`. 

1915 

1916 Parameters 

1917 ---------- 

1918 encoding : str 

1919 errors : str, optional 

1920 

1921 Returns 

1922 ------- 

1923 Series/Index of objects 

1924 """ 

1925 result = self._data.array._str_encode(encoding, errors) 

1926 return self._wrap_result(result, returns_string=False) 

1927 

    # Shared numpydoc template for strip/lstrip/rstrip; %(position)s, %(side)s
    # and %(method)s are filled in by each method's @Appender decorator below.
    _shared_docs[
        "str_strip"
    ] = r"""
    Remove %(position)s characters.

    Strip whitespaces (including newlines) or a set of specified characters
    from each string in the Series/Index from %(side)s.
    Replaces any non-strings in Series with NaNs.
    Equivalent to :meth:`str.%(method)s`.

    Parameters
    ----------
    to_strip : str or None, default None
        Specifying the set of characters to be removed.
        All combinations of this set of characters will be stripped.
        If None then whitespaces are removed.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.strip : Remove leading and trailing characters in Series/Index.
    Series.str.lstrip : Remove leading characters in Series/Index.
    Series.str.rstrip : Remove trailing characters in Series/Index.

    Examples
    --------
    >>> s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan, 10, True])
    >>> s
    0    1. Ant.
    1    2. Bee!\n
    2    3. Cat?\t
    3          NaN
    4           10
    5         True
    dtype: object

    >>> s.str.strip()
    0    1. Ant.
    1    2. Bee!
    2    3. Cat?
    3        NaN
    4        NaN
    5        NaN
    dtype: object

    >>> s.str.lstrip('123.')
    0    Ant.
    1    Bee!\n
    2    Cat?\t
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.rstrip('.!? \n\t')
    0    1. Ant
    1    2. Bee
    2    3. Cat
    3       NaN
    4       NaN
    5       NaN
    dtype: object

    >>> s.str.strip('123.!? \n\t')
    0    Ant
    1    Bee
    2    Cat
    3    NaN
    4    NaN
    5    NaN
    dtype: object
    """

2003 

2004 @Appender( 

2005 _shared_docs["str_strip"] 

2006 % { 

2007 "side": "left and right sides", 

2008 "method": "strip", 

2009 "position": "leading and trailing", 

2010 } 

2011 ) 

2012 @forbid_nonstring_types(["bytes"]) 

2013 def strip(self, to_strip=None): 

2014 result = self._data.array._str_strip(to_strip) 

2015 return self._wrap_result(result) 

2016 

2017 @Appender( 

2018 _shared_docs["str_strip"] 

2019 % {"side": "left side", "method": "lstrip", "position": "leading"} 

2020 ) 

2021 @forbid_nonstring_types(["bytes"]) 

2022 def lstrip(self, to_strip=None): 

2023 result = self._data.array._str_lstrip(to_strip) 

2024 return self._wrap_result(result) 

2025 

2026 @Appender( 

2027 _shared_docs["str_strip"] 

2028 % {"side": "right side", "method": "rstrip", "position": "trailing"} 

2029 ) 

2030 @forbid_nonstring_types(["bytes"]) 

2031 def rstrip(self, to_strip=None): 

2032 result = self._data.array._str_rstrip(to_strip) 

2033 return self._wrap_result(result) 

2034 

    # Shared numpydoc template for removeprefix/removesuffix; %(side)s and
    # %(other_side)s are filled in by each method's @Appender decorator below.
    _shared_docs[
        "str_removefix"
    ] = r"""
    Remove a %(side)s from an object series.

    If the %(side)s is not present, the original string will be returned.

    Parameters
    ----------
    %(side)s : str
        Remove the %(side)s of the string.

    Returns
    -------
    Series/Index: object
        The Series or Index with given %(side)s removed.

    See Also
    --------
    Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series.

    Examples
    --------
    >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"])
    >>> s
    0    str_foo
    1    str_bar
    2    no_prefix
    dtype: object
    >>> s.str.removeprefix("str_")
    0    foo
    1    bar
    2    no_prefix
    dtype: object

    >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"])
    >>> s
    0    foo_str
    1    bar_str
    2    no_suffix
    dtype: object
    >>> s.str.removesuffix("_str")
    0    foo
    1    bar
    2    no_suffix
    dtype: object
    """

2082 

2083 @Appender( 

2084 _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"} 

2085 ) 

2086 @forbid_nonstring_types(["bytes"]) 

2087 def removeprefix(self, prefix): 

2088 result = self._data.array._str_removeprefix(prefix) 

2089 return self._wrap_result(result) 

2090 

2091 @Appender( 

2092 _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"} 

2093 ) 

2094 @forbid_nonstring_types(["bytes"]) 

2095 def removesuffix(self, suffix): 

2096 result = self._data.array._str_removesuffix(suffix) 

2097 return self._wrap_result(result) 

2098 

2099 @forbid_nonstring_types(["bytes"]) 

2100 def wrap(self, width, **kwargs): 

2101 r""" 

2102 Wrap strings in Series/Index at specified line width. 

2103 

2104 This method has the same keyword parameters and defaults as 

2105 :class:`textwrap.TextWrapper`. 

2106 

2107 Parameters 

2108 ---------- 

2109 width : int 

2110 Maximum line width. 

2111 expand_tabs : bool, optional 

2112 If True, tab characters will be expanded to spaces (default: True). 

2113 replace_whitespace : bool, optional 

2114 If True, each whitespace character (as defined by string.whitespace) 

2115 remaining after tab expansion will be replaced by a single space 

2116 (default: True). 

2117 drop_whitespace : bool, optional 

2118 If True, whitespace that, after wrapping, happens to end up at the 

2119 beginning or end of a line is dropped (default: True). 

2120 break_long_words : bool, optional 

2121 If True, then words longer than width will be broken in order to ensure 

2122 that no lines are longer than width. If it is false, long words will 

2123 not be broken, and some lines may be longer than width (default: True). 

2124 break_on_hyphens : bool, optional 

2125 If True, wrapping will occur preferably on whitespace and right after 

2126 hyphens in compound words, as it is customary in English. If false, 

2127 only whitespaces will be considered as potentially good places for line 

2128 breaks, but you need to set break_long_words to false if you want truly 

2129 insecable words (default: True). 

2130 

2131 Returns 

2132 ------- 

2133 Series or Index 

2134 

2135 Notes 

2136 ----- 

2137 Internally, this method uses a :class:`textwrap.TextWrapper` instance with 

2138 default settings. To achieve behavior matching R's stringr library str_wrap 

2139 function, use the arguments: 

2140 

2141 - expand_tabs = False 

2142 - replace_whitespace = True 

2143 - drop_whitespace = True 

2144 - break_long_words = False 

2145 - break_on_hyphens = False 

2146 

2147 Examples 

2148 -------- 

2149 >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) 

2150 >>> s.str.wrap(12) 

2151 0 line to be\nwrapped 

2152 1 another line\nto be\nwrapped 

2153 dtype: object 

2154 """ 

2155 result = self._data.array._str_wrap(width, **kwargs) 

2156 return self._wrap_result(result) 

2157 

2158 @forbid_nonstring_types(["bytes"]) 

2159 def get_dummies(self, sep: str = "|"): 

2160 """ 

2161 Return DataFrame of dummy/indicator variables for Series. 

2162 

2163 Each string in Series is split by sep and returned as a DataFrame 

2164 of dummy/indicator variables. 

2165 

2166 Parameters 

2167 ---------- 

2168 sep : str, default "|" 

2169 String to split on. 

2170 

2171 Returns 

2172 ------- 

2173 DataFrame 

2174 Dummy variables corresponding to values of the Series. 

2175 

2176 See Also 

2177 -------- 

2178 get_dummies : Convert categorical variable into dummy/indicator 

2179 variables. 

2180 

2181 Examples 

2182 -------- 

2183 >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() 

2184 a b c 

2185 0 1 1 0 

2186 1 1 0 0 

2187 2 1 0 1 

2188 

2189 >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() 

2190 a b c 

2191 0 1 1 0 

2192 1 0 0 0 

2193 2 1 0 1 

2194 """ 

2195 # we need to cast to Series of strings as only that has all 

2196 # methods available for making the dummies... 

2197 result, name = self._data.array._str_get_dummies(sep) 

2198 return self._wrap_result( 

2199 result, 

2200 name=name, 

2201 expand=True, 

2202 returns_string=False, 

2203 ) 

2204 

2205 @forbid_nonstring_types(["bytes"]) 

2206 def translate(self, table): 

2207 """ 

2208 Map all characters in the string through the given mapping table. 

2209 

2210 Equivalent to standard :meth:`str.translate`. 

2211 

2212 Parameters 

2213 ---------- 

2214 table : dict 

2215 Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or 

2216 None. Unmapped characters are left untouched. 

2217 Characters mapped to None are deleted. :meth:`str.maketrans` is a 

2218 helper function for making translation tables. 

2219 

2220 Returns 

2221 ------- 

2222 Series or Index 

2223 """ 

2224 result = self._data.array._str_translate(table) 

2225 return self._wrap_result(result) 

2226 

2227 @forbid_nonstring_types(["bytes"]) 

2228 def count(self, pat, flags: int = 0): 

2229 r""" 

2230 Count occurrences of pattern in each string of the Series/Index. 

2231 

2232 This function is used to count the number of times a particular regex 

2233 pattern is repeated in each of the string elements of the 

2234 :class:`~pandas.Series`. 

2235 

2236 Parameters 

2237 ---------- 

2238 pat : str 

2239 Valid regular expression. 

2240 flags : int, default 0, meaning no flags 

2241 Flags for the `re` module. For a complete list, `see here 

2242 <https://docs.python.org/3/howto/regex.html#compilation-flags>`_. 

2243 **kwargs 

2244 For compatibility with other string methods. Not used. 

2245 

2246 Returns 

2247 ------- 

2248 Series or Index 

2249 Same type as the calling object containing the integer counts. 

2250 

2251 See Also 

2252 -------- 

2253 re : Standard library module for regular expressions. 

2254 str.count : Standard library version, without regular expression support. 

2255 

2256 Notes 

2257 ----- 

2258 Some characters need to be escaped when passing in `pat`. 

2259 eg. ``'$'`` has a special meaning in regex and must be escaped when 

2260 finding this literal character. 

2261 

2262 Examples 

2263 -------- 

2264 >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) 

2265 >>> s.str.count('a') 

2266 0 0.0 

2267 1 0.0 

2268 2 2.0 

2269 3 2.0 

2270 4 NaN 

2271 5 0.0 

2272 6 1.0 

2273 dtype: float64 

2274 

2275 Escape ``'$'`` to find the literal dollar sign. 

2276 

2277 >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) 

2278 >>> s.str.count('\\$') 

2279 0 1 

2280 1 0 

2281 2 1 

2282 3 2 

2283 4 2 

2284 5 0 

2285 dtype: int64 

2286 

2287 This is also available on Index 

2288 

2289 >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') 

2290 Index([0, 0, 2, 1], dtype='int64') 

2291 """ 

2292 result = self._data.array._str_count(pat, flags) 

2293 return self._wrap_result(result, returns_string=False) 

2294 

2295 @forbid_nonstring_types(["bytes"]) 

2296 def startswith( 

2297 self, pat: str | tuple[str, ...], na: Scalar | None = None 

2298 ) -> Series | Index: 

2299 """ 

2300 Test if the start of each string element matches a pattern. 

2301 

2302 Equivalent to :meth:`str.startswith`. 

2303 

2304 Parameters 

2305 ---------- 

2306 pat : str or tuple[str, ...] 

2307 Character sequence or tuple of strings. Regular expressions are not 

2308 accepted. 

2309 na : object, default NaN 

2310 Object shown if element tested is not a string. The default depends 

2311 on dtype of the array. For object-dtype, ``numpy.nan`` is used. 

2312 For ``StringDtype``, ``pandas.NA`` is used. 

2313 

2314 Returns 

2315 ------- 

2316 Series or Index of bool 

2317 A Series of booleans indicating whether the given pattern matches 

2318 the start of each string element. 

2319 

2320 See Also 

2321 -------- 

2322 str.startswith : Python standard library string method. 

2323 Series.str.endswith : Same as startswith, but tests the end of string. 

2324 Series.str.contains : Tests if string element contains a pattern. 

2325 

2326 Examples 

2327 -------- 

2328 >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) 

2329 >>> s 

2330 0 bat 

2331 1 Bear 

2332 2 cat 

2333 3 NaN 

2334 dtype: object 

2335 

2336 >>> s.str.startswith('b') 

2337 0 True 

2338 1 False 

2339 2 False 

2340 3 NaN 

2341 dtype: object 

2342 

2343 >>> s.str.startswith(('b', 'B')) 

2344 0 True 

2345 1 True 

2346 2 False 

2347 3 NaN 

2348 dtype: object 

2349 

2350 Specifying `na` to be `False` instead of `NaN`. 

2351 

2352 >>> s.str.startswith('b', na=False) 

2353 0 True 

2354 1 False 

2355 2 False 

2356 3 False 

2357 dtype: bool 

2358 """ 

2359 if not isinstance(pat, (str, tuple)): 

2360 msg = f"expected a string or tuple, not {type(pat).__name__}" 

2361 raise TypeError(msg) 

2362 result = self._data.array._str_startswith(pat, na=na) 

2363 return self._wrap_result(result, returns_string=False) 

2364 

2365 @forbid_nonstring_types(["bytes"]) 

2366 def endswith( 

2367 self, pat: str | tuple[str, ...], na: Scalar | None = None 

2368 ) -> Series | Index: 

2369 """ 

2370 Test if the end of each string element matches a pattern. 

2371 

2372 Equivalent to :meth:`str.endswith`. 

2373 

2374 Parameters 

2375 ---------- 

2376 pat : str or tuple[str, ...] 

2377 Character sequence or tuple of strings. Regular expressions are not 

2378 accepted. 

2379 na : object, default NaN 

2380 Object shown if element tested is not a string. The default depends 

2381 on dtype of the array. For object-dtype, ``numpy.nan`` is used. 

2382 For ``StringDtype``, ``pandas.NA`` is used. 

2383 

2384 Returns 

2385 ------- 

2386 Series or Index of bool 

2387 A Series of booleans indicating whether the given pattern matches 

2388 the end of each string element. 

2389 

2390 See Also 

2391 -------- 

2392 str.endswith : Python standard library string method. 

2393 Series.str.startswith : Same as endswith, but tests the start of string. 

2394 Series.str.contains : Tests if string element contains a pattern. 

2395 

2396 Examples 

2397 -------- 

2398 >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) 

2399 >>> s 

2400 0 bat 

2401 1 bear 

2402 2 caT 

2403 3 NaN 

2404 dtype: object 

2405 

2406 >>> s.str.endswith('t') 

2407 0 True 

2408 1 False 

2409 2 False 

2410 3 NaN 

2411 dtype: object 

2412 

2413 >>> s.str.endswith(('t', 'T')) 

2414 0 True 

2415 1 False 

2416 2 True 

2417 3 NaN 

2418 dtype: object 

2419 

2420 Specifying `na` to be `False` instead of `NaN`. 

2421 

2422 >>> s.str.endswith('t', na=False) 

2423 0 True 

2424 1 False 

2425 2 False 

2426 3 False 

2427 dtype: bool 

2428 """ 

2429 if not isinstance(pat, (str, tuple)): 

2430 msg = f"expected a string or tuple, not {type(pat).__name__}" 

2431 raise TypeError(msg) 

2432 result = self._data.array._str_endswith(pat, na=na) 

2433 return self._wrap_result(result, returns_string=False) 

2434 

2435 @forbid_nonstring_types(["bytes"]) 

2436 def findall(self, pat, flags: int = 0): 

2437 """ 

2438 Find all occurrences of pattern or regular expression in the Series/Index. 

2439 

2440 Equivalent to applying :func:`re.findall` to all the elements in the 

2441 Series/Index. 

2442 

2443 Parameters 

2444 ---------- 

2445 pat : str 

2446 Pattern or regular expression. 

2447 flags : int, default 0 

2448 Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which 

2449 means no flags). 

2450 

2451 Returns 

2452 ------- 

2453 Series/Index of lists of strings 

2454 All non-overlapping matches of pattern or regular expression in each 

2455 string of this Series/Index. 

2456 

2457 See Also 

2458 -------- 

2459 count : Count occurrences of pattern or regular expression in each string 

2460 of the Series/Index. 

2461 extractall : For each string in the Series, extract groups from all matches 

2462 of regular expression and return a DataFrame with one row for each 

2463 match and one column for each group. 

2464 re.findall : The equivalent ``re`` function to all non-overlapping matches 

2465 of pattern or regular expression in string, as a list of strings. 

2466 

2467 Examples 

2468 -------- 

2469 >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) 

2470 

2471 The search for the pattern 'Monkey' returns one match: 

2472 

2473 >>> s.str.findall('Monkey') 

2474 0 [] 

2475 1 [Monkey] 

2476 2 [] 

2477 dtype: object 

2478 

2479 On the other hand, the search for the pattern 'MONKEY' doesn't return any 

2480 match: 

2481 

2482 >>> s.str.findall('MONKEY') 

2483 0 [] 

2484 1 [] 

2485 2 [] 

2486 dtype: object 

2487 

2488 Flags can be added to the pattern or regular expression. For instance, 

2489 to find the pattern 'MONKEY' ignoring the case: 

2490 

2491 >>> import re 

2492 >>> s.str.findall('MONKEY', flags=re.IGNORECASE) 

2493 0 [] 

2494 1 [Monkey] 

2495 2 [] 

2496 dtype: object 

2497 

2498 When the pattern matches more than one string in the Series, all matches 

2499 are returned: 

2500 

2501 >>> s.str.findall('on') 

2502 0 [on] 

2503 1 [on] 

2504 2 [] 

2505 dtype: object 

2506 

2507 Regular expressions are supported too. For instance, the search for all the 

2508 strings ending with the word 'on' is shown next: 

2509 

2510 >>> s.str.findall('on$') 

2511 0 [on] 

2512 1 [] 

2513 2 [] 

2514 dtype: object 

2515 

2516 If the pattern is found more than once in the same string, then a list of 

2517 multiple strings is returned: 

2518 

2519 >>> s.str.findall('b') 

2520 0 [] 

2521 1 [] 

2522 2 [b, b] 

2523 dtype: object 

2524 """ 

2525 result = self._data.array._str_findall(pat, flags) 

2526 return self._wrap_result(result, returns_string=False) 

2527 

    @forbid_nonstring_types(["bytes"])
    def extract(
        self, pat: str, flags: int = 0, expand: bool = True
    ) -> DataFrame | Series | Index:
        r"""
        Extract capture groups in the regex `pat` as columns in a DataFrame.

        For each subject string in the Series, extract groups from the
        first match of regular expression `pat`.

        Parameters
        ----------
        pat : str
            Regular expression pattern with capturing groups.
        flags : int, default 0 (no flags)
            Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that
            modify regular expression matching for things like case,
            spaces, etc. For more details, see :mod:`re`.
        expand : bool, default True
            If True, return DataFrame with one column per capture group.
            If False, return a Series/Index if there is one capture group
            or DataFrame if there are multiple capture groups.

        Returns
        -------
        DataFrame or Series or Index
            A DataFrame with one row for each subject string, and one
            column for each group. Any capture group names in regular
            expression pat will be used for column names; otherwise
            capture group numbers will be used. If ``expand=False`` and
            pat has only one capture group, then return a Series (if
            subject is a Series) or Index (if subject is an Index).

        Raises
        ------
        ValueError
            If ``expand`` is not a bool, if `pat` contains no capture
            groups, or if ``expand=False`` with multiple groups on an Index.

        See Also
        --------
        extractall : Returns all matches (not just the first match).

        Examples
        --------
        >>> s = pd.Series(['a1', 'b2', 'c3'])
        >>> s.str.extract(r'([ab])(\d)')
             0    1
        0    a    1
        1    b    2
        2  NaN  NaN

        >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)')
          letter digit
        0      a     1
        1      b     2
        2    NaN   NaN

        >>> s.str.extract(r'[ab](\d)', expand=False)
        0      1
        1      2
        2    NaN
        dtype: object
        """
        from pandas import DataFrame

        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        # Compile eagerly so group-count validation happens before dispatch.
        regex = re.compile(pat, flags=flags)
        if regex.groups == 0:
            raise ValueError("pattern contains no capture groups")

        # An Index cannot hold a 2D result unless expanded to a DataFrame.
        if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex):
            raise ValueError("only one regex group is supported with Index")

        obj = self._data
        result_dtype = _result_dtype(obj)

        # More than one group always yields a DataFrame, as does expand=True.
        returns_df = regex.groups > 1 or expand

        if returns_df:
            name = None
            # Named groups become column labels; otherwise group numbers.
            columns = _get_group_names(regex)

            if obj.array.size == 0:
                # Empty input: build an empty frame with the right columns
                # without dispatching to the backend.
                result = DataFrame(columns=columns, dtype=result_dtype)

            else:
                result_list = self._data.array._str_extract(
                    pat, flags=flags, expand=returns_df
                )

                # Preserve the Series index; an Index subject has none.
                result_index: Index | None
                if isinstance(obj, ABCSeries):
                    result_index = obj.index
                else:
                    result_index = None

                result = DataFrame(
                    result_list, columns=columns, index=result_index, dtype=result_dtype
                )

        else:
            # Single group, expand=False: 1D result named after the group.
            name = _get_single_group_name(regex)
            result = self._data.array._str_extract(pat, flags=flags, expand=returns_df)
        return self._wrap_result(result, name=name)

2655 

2656 @forbid_nonstring_types(["bytes"]) 

2657 def extractall(self, pat, flags: int = 0): 

2658 r""" 

2659 Extract capture groups in the regex `pat` as columns in DataFrame. 

2660 

2661 For each subject string in the Series, extract groups from all 

2662 matches of regular expression pat. When each subject string in the 

2663 Series has exactly one match, extractall(pat).xs(0, level='match') 

2664 is the same as extract(pat). 

2665 

2666 Parameters 

2667 ---------- 

2668 pat : str 

2669 Regular expression pattern with capturing groups. 

2670 flags : int, default 0 (no flags) 

2671 A ``re`` module flag, for example ``re.IGNORECASE``. These allow 

2672 to modify regular expression matching for things like case, spaces, 

2673 etc. Multiple flags can be combined with the bitwise OR operator, 

2674 for example ``re.IGNORECASE | re.MULTILINE``. 

2675 

2676 Returns 

2677 ------- 

2678 DataFrame 

2679 A ``DataFrame`` with one row for each match, and one column for each 

2680 group. Its rows have a ``MultiIndex`` with first levels that come from 

2681 the subject ``Series``. The last level is named 'match' and indexes the 

2682 matches in each item of the ``Series``. Any capture group names in 

2683 regular expression pat will be used for column names; otherwise capture 

2684 group numbers will be used. 

2685 

2686 See Also 

2687 -------- 

2688 extract : Returns first match only (not all matches). 

2689 

2690 Examples 

2691 -------- 

2692 A pattern with one group will return a DataFrame with one column. 

2693 Indices with no matches will not appear in the result. 

2694 

2695 >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) 

2696 >>> s.str.extractall(r"[ab](\d)") 

2697 0 

2698 match 

2699 A 0 1 

2700 1 2 

2701 B 0 1 

2702 

2703 Capture group names are used for column names of the result. 

2704 

2705 >>> s.str.extractall(r"[ab](?P<digit>\d)") 

2706 digit 

2707 match 

2708 A 0 1 

2709 1 2 

2710 B 0 1 

2711 

2712 A pattern with two groups will return a DataFrame with two columns. 

2713 

2714 >>> s.str.extractall(r"(?P<letter>[ab])(?P<digit>\d)") 

2715 letter digit 

2716 match 

2717 A 0 a 1 

2718 1 a 2 

2719 B 0 b 1 

2720 

2721 Optional groups that do not match are NaN in the result. 

2722 

2723 >>> s.str.extractall(r"(?P<letter>[ab])?(?P<digit>\d)") 

2724 letter digit 

2725 match 

2726 A 0 a 1 

2727 1 a 2 

2728 B 0 b 1 

2729 C 0 NaN 1 

2730 """ 

2731 # TODO: dispatch 

2732 return str_extractall(self._orig, pat, flags) 

2733 

    # Shared docstring template for find/rfind; %(side)s, %(method)s and
    # %(also)s are filled in per-method via @Appender.
    _shared_docs[
        "find"
    ] = """
    Return %(side)s indexes in each strings in the Series/Index.

    Each of returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. Return -1 on
    failure. Equivalent to standard :meth:`str.%(method)s`.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of int.

    See Also
    --------
    %(also)s
    """

2760 

2761 @Appender( 

2762 _shared_docs["find"] 

2763 % { 

2764 "side": "lowest", 

2765 "method": "find", 

2766 "also": "rfind : Return highest indexes in each strings.", 

2767 } 

2768 ) 

2769 @forbid_nonstring_types(["bytes"]) 

2770 def find(self, sub, start: int = 0, end=None): 

2771 if not isinstance(sub, str): 

2772 msg = f"expected a string object, not {type(sub).__name__}" 

2773 raise TypeError(msg) 

2774 

2775 result = self._data.array._str_find(sub, start, end) 

2776 return self._wrap_result(result, returns_string=False) 

2777 

2778 @Appender( 

2779 _shared_docs["find"] 

2780 % { 

2781 "side": "highest", 

2782 "method": "rfind", 

2783 "also": "find : Return lowest indexes in each strings.", 

2784 } 

2785 ) 

2786 @forbid_nonstring_types(["bytes"]) 

2787 def rfind(self, sub, start: int = 0, end=None): 

2788 if not isinstance(sub, str): 

2789 msg = f"expected a string object, not {type(sub).__name__}" 

2790 raise TypeError(msg) 

2791 

2792 result = self._data.array._str_rfind(sub, start=start, end=end) 

2793 return self._wrap_result(result, returns_string=False) 

2794 

2795 @forbid_nonstring_types(["bytes"]) 

2796 def normalize(self, form): 

2797 """ 

2798 Return the Unicode normal form for the strings in the Series/Index. 

2799 

2800 For more information on the forms, see the 

2801 :func:`unicodedata.normalize`. 

2802 

2803 Parameters 

2804 ---------- 

2805 form : {'NFC', 'NFKC', 'NFD', 'NFKD'} 

2806 Unicode form. 

2807 

2808 Returns 

2809 ------- 

2810 Series/Index of objects 

2811 """ 

2812 result = self._data.array._str_normalize(form) 

2813 return self._wrap_result(result) 

2814 

    # Shared docstring template for index/rindex; %(side)s, %(similar)s,
    # %(method)s and %(also)s are filled in per-method via @Appender.
    _shared_docs[
        "index"
    ] = """
    Return %(side)s indexes in each string in Series/Index.

    Each of the returned indexes corresponds to the position where the
    substring is fully contained between [start:end]. This is the same
    as ``str.%(similar)s`` except instead of returning -1, it raises a
    ValueError when the substring is not found. Equivalent to standard
    ``str.%(method)s``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int
        Left edge index.
    end : int
        Right edge index.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    %(also)s
    """

2843 

2844 @Appender( 

2845 _shared_docs["index"] 

2846 % { 

2847 "side": "lowest", 

2848 "similar": "find", 

2849 "method": "index", 

2850 "also": "rindex : Return highest indexes in each strings.", 

2851 } 

2852 ) 

2853 @forbid_nonstring_types(["bytes"]) 

2854 def index(self, sub, start: int = 0, end=None): 

2855 if not isinstance(sub, str): 

2856 msg = f"expected a string object, not {type(sub).__name__}" 

2857 raise TypeError(msg) 

2858 

2859 result = self._data.array._str_index(sub, start=start, end=end) 

2860 return self._wrap_result(result, returns_string=False) 

2861 

2862 @Appender( 

2863 _shared_docs["index"] 

2864 % { 

2865 "side": "highest", 

2866 "similar": "rfind", 

2867 "method": "rindex", 

2868 "also": "index : Return lowest indexes in each strings.", 

2869 } 

2870 ) 

2871 @forbid_nonstring_types(["bytes"]) 

2872 def rindex(self, sub, start: int = 0, end=None): 

2873 if not isinstance(sub, str): 

2874 msg = f"expected a string object, not {type(sub).__name__}" 

2875 raise TypeError(msg) 

2876 

2877 result = self._data.array._str_rindex(sub, start=start, end=end) 

2878 return self._wrap_result(result, returns_string=False) 

2879 

2880 def len(self): 

2881 """ 

2882 Compute the length of each element in the Series/Index. 

2883 

2884 The element may be a sequence (such as a string, tuple or list) or a collection 

2885 (such as a dictionary). 

2886 

2887 Returns 

2888 ------- 

2889 Series or Index of int 

2890 A Series or Index of integer values indicating the length of each 

2891 element in the Series or Index. 

2892 

2893 See Also 

2894 -------- 

2895 str.len : Python built-in function returning the length of an object. 

2896 Series.size : Returns the length of the Series. 

2897 

2898 Examples 

2899 -------- 

2900 Returns the length (number of characters) in a string. Returns the 

2901 number of entries for dictionaries, lists or tuples. 

2902 

2903 >>> s = pd.Series(['dog', 

2904 ... '', 

2905 ... 5, 

2906 ... {'foo' : 'bar'}, 

2907 ... [2, 3, 5, 7], 

2908 ... ('one', 'two', 'three')]) 

2909 >>> s 

2910 0 dog 

2911 1 

2912 2 5 

2913 3 {'foo': 'bar'} 

2914 4 [2, 3, 5, 7] 

2915 5 (one, two, three) 

2916 dtype: object 

2917 >>> s.str.len() 

2918 0 3.0 

2919 1 0.0 

2920 2 NaN 

2921 3 1.0 

2922 4 4.0 

2923 5 3.0 

2924 dtype: float64 

2925 """ 

2926 result = self._data.array._str_len() 

2927 return self._wrap_result(result, returns_string=False) 

2928 

    # Shared docstring template for the case-conversion methods (lower,
    # upper, title, capitalize, swapcase, casefold); filled in per-method
    # from _doc_args via @Appender.
    _shared_docs[
        "casemethods"
    ] = """
    Convert strings in the Series/Index to %(type)s.
    %(version)s
    Equivalent to :meth:`str.%(method)s`.

    Returns
    -------
    Series or Index of object

    See Also
    --------
    Series.str.lower : Converts all characters to lowercase.
    Series.str.upper : Converts all characters to uppercase.
    Series.str.title : Converts first character of each word to uppercase and
        remaining to lowercase.
    Series.str.capitalize : Converts first character to uppercase and
        remaining to lowercase.
    Series.str.swapcase : Converts uppercase to lowercase and lowercase to
        uppercase.
    Series.str.casefold: Removes all case distinctions in the string.

    Examples
    --------
    >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe'])
    >>> s
    0                 lower
    1              CAPITALS
    2    this is a sentence
    3              SwApCaSe
    dtype: object

    >>> s.str.lower()
    0                 lower
    1              capitals
    2    this is a sentence
    3              swapcase
    dtype: object

    >>> s.str.upper()
    0                 LOWER
    1              CAPITALS
    2    THIS IS A SENTENCE
    3              SWAPCASE
    dtype: object

    >>> s.str.title()
    0                 Lower
    1              Capitals
    2    This Is A Sentence
    3              Swapcase
    dtype: object

    >>> s.str.capitalize()
    0                 Lower
    1              Capitals
    2    This is a sentence
    3              Swapcase
    dtype: object

    >>> s.str.swapcase()
    0                 LOWER
    1              capitals
    2    THIS IS A SENTENCE
    3              sWaPcAsE
    dtype: object
    """

2997 # Types: 

2998 # cases: 

2999 # upper, lower, title, capitalize, swapcase, casefold 

3000 # boolean: 

3001 # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle 

3002 # _doc_args holds dict of strings to use in substituting casemethod docs 

3003 _doc_args: dict[str, dict[str, str]] = {} 

3004 _doc_args["lower"] = {"type": "lowercase", "method": "lower", "version": ""} 

3005 _doc_args["upper"] = {"type": "uppercase", "method": "upper", "version": ""} 

3006 _doc_args["title"] = {"type": "titlecase", "method": "title", "version": ""} 

3007 _doc_args["capitalize"] = { 

3008 "type": "be capitalized", 

3009 "method": "capitalize", 

3010 "version": "", 

3011 } 

3012 _doc_args["swapcase"] = { 

3013 "type": "be swapcased", 

3014 "method": "swapcase", 

3015 "version": "", 

3016 } 

3017 _doc_args["casefold"] = { 

3018 "type": "be casefolded", 

3019 "method": "casefold", 

3020 "version": "", 

3021 } 

3022 

3023 @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) 

3024 @forbid_nonstring_types(["bytes"]) 

3025 def lower(self): 

3026 result = self._data.array._str_lower() 

3027 return self._wrap_result(result) 

3028 

3029 @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) 

3030 @forbid_nonstring_types(["bytes"]) 

3031 def upper(self): 

3032 result = self._data.array._str_upper() 

3033 return self._wrap_result(result) 

3034 

3035 @Appender(_shared_docs["casemethods"] % _doc_args["title"]) 

3036 @forbid_nonstring_types(["bytes"]) 

3037 def title(self): 

3038 result = self._data.array._str_title() 

3039 return self._wrap_result(result) 

3040 

3041 @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) 

3042 @forbid_nonstring_types(["bytes"]) 

3043 def capitalize(self): 

3044 result = self._data.array._str_capitalize() 

3045 return self._wrap_result(result) 

3046 

3047 @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) 

3048 @forbid_nonstring_types(["bytes"]) 

3049 def swapcase(self): 

3050 result = self._data.array._str_swapcase() 

3051 return self._wrap_result(result) 

3052 

3053 @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) 

3054 @forbid_nonstring_types(["bytes"]) 

3055 def casefold(self): 

3056 result = self._data.array._str_casefold() 

3057 return self._wrap_result(result) 

3058 

    # Shared docstring template for the str.is* predicate methods below;
    # the "%(type)s" / "%(method)s" placeholders are filled in from _doc_args.
    _shared_docs[
        "ismethods"
    ] = """
    Check whether all characters in each string are %(type)s.

    This is equivalent to running the Python string method
    :meth:`str.%(method)s` for each element of the Series/Index. If a string
    has zero characters, ``False`` is returned for that check.

    Returns
    -------
    Series or Index of bool
        Series or Index of boolean values with the same length as the original
        Series/Index.

    See Also
    --------
    Series.str.isalpha : Check whether all characters are alphabetic.
    Series.str.isnumeric : Check whether all characters are numeric.
    Series.str.isalnum : Check whether all characters are alphanumeric.
    Series.str.isdigit : Check whether all characters are digits.
    Series.str.isdecimal : Check whether all characters are decimal.
    Series.str.isspace : Check whether all characters are whitespace.
    Series.str.islower : Check whether all characters are lowercase.
    Series.str.isupper : Check whether all characters are uppercase.
    Series.str.istitle : Check whether all characters are titlecase.

    Examples
    --------
    **Checks for Alphabetic and Numeric Characters**

    >>> s1 = pd.Series(['one', 'one1', '1', ''])

    >>> s1.str.isalpha()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s1.str.isnumeric()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    >>> s1.str.isalnum()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    Note that checks against characters mixed with any additional punctuation
    or whitespace will evaluate to false for an alphanumeric check.

    >>> s2 = pd.Series(['A B', '1.5', '3,000'])
    >>> s2.str.isalnum()
    0    False
    1    False
    2    False
    dtype: bool

    **More Detailed Checks for Numeric Characters**

    There are several different but overlapping sets of numeric characters that
    can be checked for.

    >>> s3 = pd.Series(['23', '³', '⅕', ''])

    The ``s3.str.isdecimal`` method checks for characters used to form numbers
    in base 10.

    >>> s3.str.isdecimal()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also
    includes special digits, like superscripted and subscripted digits in
    unicode.

    >>> s3.str.isdigit()
    0     True
    1     True
    2    False
    3    False
    dtype: bool

    The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also
    includes other characters that can represent quantities such as unicode
    fractions.

    >>> s3.str.isnumeric()
    0     True
    1     True
    2     True
    3    False
    dtype: bool

    **Checks for Whitespace**

    >>> s4 = pd.Series([' ', '\\t\\r\\n ', ''])
    >>> s4.str.isspace()
    0     True
    1     True
    2    False
    dtype: bool

    **Checks for Character Case**

    >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', ''])

    >>> s5.str.islower()
    0     True
    1    False
    2    False
    3    False
    dtype: bool

    >>> s5.str.isupper()
    0    False
    1    False
    2     True
    3    False
    dtype: bool

    The ``s5.str.istitle`` method checks for whether all words are in title
    case (whether only the first letter of each word is capitalized). Words are
    assumed to be as any sequence of non-numeric characters separated by
    whitespace characters.

    >>> s5.str.istitle()
    0    False
    1     True
    2    False
    3    False
    dtype: bool
    """

    # Substitution values for the shared "ismethods" docstring template above:
    # "type" is the human-readable character class, "method" the str method.
    _doc_args["isalnum"] = {"type": "alphanumeric", "method": "isalnum"}
    _doc_args["isalpha"] = {"type": "alphabetic", "method": "isalpha"}
    _doc_args["isdigit"] = {"type": "digits", "method": "isdigit"}
    _doc_args["isspace"] = {"type": "whitespace", "method": "isspace"}
    _doc_args["islower"] = {"type": "lowercase", "method": "islower"}
    _doc_args["isupper"] = {"type": "uppercase", "method": "isupper"}
    _doc_args["istitle"] = {"type": "titlecase", "method": "istitle"}
    _doc_args["isnumeric"] = {"type": "numeric", "method": "isnumeric"}
    _doc_args["isdecimal"] = {"type": "decimal", "method": "isdecimal"}
    # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624)

    # str.is* predicate accessors, generated by _map_and_wrap: each maps the
    # corresponding Python string method over the array elementwise and
    # returns booleans, with its docstring built from the shared template.
    isalnum = _map_and_wrap(
        "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
    )
    isalpha = _map_and_wrap(
        "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
    )
    isdigit = _map_and_wrap(
        "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
    )
    isspace = _map_and_wrap(
        "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
    )
    islower = _map_and_wrap(
        "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
    )
    isupper = _map_and_wrap(
        "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
    )
    istitle = _map_and_wrap(
        "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
    )
    isnumeric = _map_and_wrap(
        "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
    )
    isdecimal = _map_and_wrap(
        "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
    )

3239 

3240 

def cat_safe(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`.

    Same signature as cat_core, but handles TypeErrors in concatenation, which
    happen if the arrays in list_of_columns have the wrong dtypes or content.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.

    Raises
    ------
    TypeError
        If any column contains non-string values (wrong dtype, or non-strings
        hidden behind object dtype).
    """
    try:
        result = cat_core(list_of_columns, sep)
    except TypeError as err:
        # np.sum fails with TypeError when any non-string values are present
        # (wrong dtype or hidden behind object dtype); scan the columns so we
        # can raise a better message naming the offending inferred dtype.
        for column in list_of_columns:
            dtype = lib.infer_dtype(column, skipna=True)
            if dtype not in ["string", "empty"]:
                raise TypeError(
                    "Concatenation requires list-likes containing only "
                    "strings (or missing values). Offending values found in "
                    f"column {dtype}"
                ) from None
        # No offending column was identified, so the TypeError came from
        # something else. Re-raise it rather than falling through to an
        # unbound `result` (which would have surfaced as a NameError).
        raise err
    return result

3275 

3276 

def cat_core(list_of_columns: list, sep: str):
    """
    Auxiliary function for :meth:`str.cat`

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        List of arrays to be concatenated with sep;
        these arrays may not contain NaNs!
    sep : string
        The separator string for concatenating the columns.

    Returns
    -------
    nd.array
        The concatenation of list_of_columns with sep.
    """
    if sep == "":
        # An empty separator needs no interleaving: just reduce the columns
        # with elementwise string addition.
        return np.sum(np.asarray(list_of_columns, dtype=object), axis=0)
    # Interleave the separator between the columns (e.g. [a, b] becomes
    # [a, sep, b]) and reduce with elementwise string addition.
    interleaved: list = [sep] * (2 * len(list_of_columns) - 1)
    interleaved[::2] = list_of_columns
    return np.sum(np.asarray(interleaved, dtype=object), axis=0)

3302 

3303 

3304def _result_dtype(arr): 

3305 # workaround #27953 

3306 # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails 

3307 # when the list of values is empty. 

3308 from pandas.core.arrays.string_ import StringDtype 

3309 

3310 if isinstance(arr.dtype, StringDtype): 

3311 return arr.dtype 

3312 else: 

3313 return object 

3314 

3315 

3316def _get_single_group_name(regex: re.Pattern) -> Hashable: 

3317 if regex.groupindex: 

3318 return next(iter(regex.groupindex)) 

3319 else: 

3320 return None 

3321 

3322 

3323def _get_group_names(regex: re.Pattern) -> list[Hashable]: 

3324 """ 

3325 Get named groups from compiled regex. 

3326 

3327 Unnamed groups are numbered. 

3328 

3329 Parameters 

3330 ---------- 

3331 regex : compiled regex 

3332 

3333 Returns 

3334 ------- 

3335 list of column labels 

3336 """ 

3337 names = {v: k for k, v in regex.groupindex.items()} 

3338 return [names.get(1 + i, i) for i in range(regex.groups)] 

3339 

3340 

def str_extractall(arr, pat, flags: int = 0):
    """
    Extract all matches of ``pat`` from each string in ``arr``.

    Parameters
    ----------
    arr : Series or Index
        Subjects to search; non-string entries are skipped.
    pat : str
        Regular expression pattern; must contain at least one capture group.
    flags : int, default 0
        Flags from the ``re`` module, e.g. ``re.IGNORECASE``.

    Returns
    -------
    DataFrame
        One row per match, indexed by the subject's index plus a trailing
        "match" level counting matches within each subject; one column per
        capture group (named groups keep their name, unnamed ones are
        numbered).

    Raises
    ------
    ValueError
        If ``pat`` contains no capture groups.
    """
    regex = re.compile(pat, flags=flags)
    # the regex must contain capture groups.
    if regex.groups == 0:
        raise ValueError("pattern contains no capture groups")

    if isinstance(arr, ABCIndex):
        arr = arr.to_series().reset_index(drop=True)

    columns = _get_group_names(regex)
    match_list = []
    index_list = []
    is_mi = arr.index.nlevels > 1

    for subject_key, subject in arr.items():
        if isinstance(subject, str):
            if not is_mi:
                # Normalize scalar keys to tuples so the "match" level can be
                # appended uniformly below.
                subject_key = (subject_key,)

            for match_i, match_tuple in enumerate(regex.findall(subject)):
                if isinstance(match_tuple, str):
                    # findall returns bare strings for single-group patterns.
                    match_tuple = (match_tuple,)
                # Empty captures (a group that did not participate in the
                # match) become missing values. Use np.nan, not np.NaN: the
                # NaN alias was removed in NumPy 2.0 (same object in < 2.0).
                na_tuple = [np.nan if group == "" else group for group in match_tuple]
                match_list.append(na_tuple)
                # subject_key is already a tuple in both branches.
                index_list.append(subject_key + (match_i,))

    from pandas import MultiIndex

    index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"])
    dtype = _result_dtype(arr)

    result = arr._constructor_expanddim(
        match_list, index=index, columns=columns, dtype=dtype
    )
    return result