Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/strings/object_array.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

300 statements  

1from __future__ import annotations 

2 

3import functools 

4import re 

5import textwrap 

6from typing import ( 

7 TYPE_CHECKING, 

8 Callable, 

9 Literal, 

10 cast, 

11) 

12import unicodedata 

13 

14import numpy as np 

15 

16from pandas._libs import lib 

17import pandas._libs.missing as libmissing 

18import pandas._libs.ops as libops 

19 

20from pandas.core.dtypes.missing import isna 

21 

22from pandas.core.strings.base import BaseStringArrayMethods 

23 

24if TYPE_CHECKING: 

25 from collections.abc import Sequence 

26 

27 from pandas._typing import ( 

28 NpDtype, 

29 Scalar, 

30 ) 

31 

32 from pandas import Series 

33 

34 

35class ObjectStringArrayMixin(BaseStringArrayMethods): 

36 """ 

37 String Methods operating on object-dtype ndarrays. 

38 """ 

39 

40 _str_na_value = np.nan 

41 

42 def __len__(self) -> int: 

43 # For typing, _str_map relies on the object being sized. 

44 raise NotImplementedError 

45 

def _str_map(
    self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
):
    """
    Map a callable over valid elements of the array.

    Parameters
    ----------
    f : Callable
        A function to call on each non-NA element.
    na_value : Scalar, optional
        The value to set for NA values. Might also be used for the
        fill value if the callable `f` raises an exception.
        This defaults to ``self._str_na_value`` which is ``np.nan``
        for object-dtype and Categorical and ``pd.NA`` for StringArray.
    dtype : Dtype, optional
        The dtype of the result array. Defaults to object dtype.
    convert : bool, default True
        Whether to call `maybe_convert_objects` on the resulting ndarray

    Returns
    -------
    np.ndarray or object
        An empty array of ``dtype`` when ``self`` is empty. Otherwise the
        output of ``lib.map_infer_mask``; it is post-processed only when
        it is an ``np.ndarray`` and returned untouched if it is not.

    Raises
    ------
    TypeError, AttributeError
        Re-raised when the message of the caught error matches the
        "takes/missing ... positional argument(s)" pattern below.
    """
    if dtype is None:
        dtype = np.dtype("object")
    if na_value is None:
        na_value = self._str_na_value

    # Nothing to map: hand back an empty array of the requested dtype.
    if not len(self):
        return np.array([], dtype=dtype)

    arr = np.asarray(self, dtype=object)
    mask = isna(arr)
    # Conversion inside the mapping step is skipped when every element is NA.
    map_convert = convert and not np.all(mask)
    try:
        # The boolean mask is passed reinterpreted as uint8.
        result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
    except (TypeError, AttributeError) as err:
        # Reraise the exception if callable `f` got wrong number of args.
        # The user may want to be warned by this, instead of getting NaN
        p_err = (
            r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
            r"(?(3)required )positional arguments?"
        )

        if len(err.args) >= 1 and re.search(p_err, err.args[0]):
            # FIXME: this should be totally avoidable
            raise err

        def g(x):
            # This type of fallback behavior can be removed once
            # we remove object-dtype .str accessor.
            try:
                return f(x)
            except (TypeError, AttributeError):
                return na_value

        # Retry with a wrapper that turns per-element failures into na_value.
        # NOTE(review): `convert` is not forwarded here, so the retry runs
        # with the default convert=True -- confirm this is intended.
        return self._str_map(g, na_value=na_value, dtype=dtype)
    if not isinstance(result, np.ndarray):
        return result
    # Identity check against the np.nan object: the masked slots are only
    # overwritten when a different fill value was requested.
    # NOTE(review): presumably the mapping step already leaves NaN in the
    # masked slots -- confirm against lib.map_infer_mask.
    if na_value is not np.nan:
        np.putmask(result, mask, na_value)
        if convert and result.dtype == object:
            result = lib.maybe_convert_objects(result)
    return result

107 

108 def _str_count(self, pat, flags: int = 0): 

109 regex = re.compile(pat, flags=flags) 

110 f = lambda x: len(regex.findall(x)) 

111 return self._str_map(f, dtype="int64") 

112 

113 def _str_pad( 

114 self, 

115 width: int, 

116 side: Literal["left", "right", "both"] = "left", 

117 fillchar: str = " ", 

118 ): 

119 if side == "left": 

120 f = lambda x: x.rjust(width, fillchar) 

121 elif side == "right": 

122 f = lambda x: x.ljust(width, fillchar) 

123 elif side == "both": 

124 f = lambda x: x.center(width, fillchar) 

125 else: # pragma: no cover 

126 raise ValueError("Invalid side") 

127 return self._str_map(f) 

128 

129 def _str_contains( 

130 self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True 

131 ): 

132 if regex: 

133 if not case: 

134 flags |= re.IGNORECASE 

135 

136 pat = re.compile(pat, flags=flags) 

137 

138 f = lambda x: pat.search(x) is not None 

139 else: 

140 if case: 

141 f = lambda x: pat in x 

142 else: 

143 upper_pat = pat.upper() 

144 f = lambda x: upper_pat in x.upper() 

145 return self._str_map(f, na, dtype=np.dtype("bool")) 

146 

147 def _str_startswith(self, pat, na=None): 

148 f = lambda x: x.startswith(pat) 

149 return self._str_map(f, na_value=na, dtype=np.dtype(bool)) 

150 

151 def _str_endswith(self, pat, na=None): 

152 f = lambda x: x.endswith(pat) 

153 return self._str_map(f, na_value=na, dtype=np.dtype(bool)) 

154 

155 def _str_replace( 

156 self, 

157 pat: str | re.Pattern, 

158 repl: str | Callable, 

159 n: int = -1, 

160 case: bool = True, 

161 flags: int = 0, 

162 regex: bool = True, 

163 ): 

164 if case is False: 

165 # add case flag, if provided 

166 flags |= re.IGNORECASE 

167 

168 if regex or flags or callable(repl): 

169 if not isinstance(pat, re.Pattern): 

170 if regex is False: 

171 pat = re.escape(pat) 

172 pat = re.compile(pat, flags=flags) 

173 

174 n = n if n >= 0 else 0 

175 f = lambda x: pat.sub(repl=repl, string=x, count=n) 

176 else: 

177 f = lambda x: x.replace(pat, repl, n) 

178 

179 return self._str_map(f, dtype=str) 

180 

181 def _str_repeat(self, repeats: int | Sequence[int]): 

182 if lib.is_integer(repeats): 

183 rint = cast(int, repeats) 

184 

185 def scalar_rep(x): 

186 try: 

187 return bytes.__mul__(x, rint) 

188 except TypeError: 

189 return str.__mul__(x, rint) 

190 

191 return self._str_map(scalar_rep, dtype=str) 

192 else: 

193 from pandas.core.arrays.string_ import BaseStringArray 

194 

195 def rep(x, r): 

196 if x is libmissing.NA: 

197 return x 

198 try: 

199 return bytes.__mul__(x, r) 

200 except TypeError: 

201 return str.__mul__(x, r) 

202 

203 result = libops.vec_binop( 

204 np.asarray(self), 

205 np.asarray(repeats, dtype=object), 

206 rep, 

207 ) 

208 if isinstance(self, BaseStringArray): 

209 # Not going through map, so we have to do this here. 

210 result = type(self)._from_sequence(result, dtype=self.dtype) 

211 return result 

212 

213 def _str_match( 

214 self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None 

215 ): 

216 if not case: 

217 flags |= re.IGNORECASE 

218 

219 regex = re.compile(pat, flags=flags) 

220 

221 f = lambda x: regex.match(x) is not None 

222 return self._str_map(f, na_value=na, dtype=np.dtype(bool)) 

223 

224 def _str_fullmatch( 

225 self, 

226 pat: str | re.Pattern, 

227 case: bool = True, 

228 flags: int = 0, 

229 na: Scalar | None = None, 

230 ): 

231 if not case: 

232 flags |= re.IGNORECASE 

233 

234 regex = re.compile(pat, flags=flags) 

235 

236 f = lambda x: regex.fullmatch(x) is not None 

237 return self._str_map(f, na_value=na, dtype=np.dtype(bool)) 

238 

239 def _str_encode(self, encoding, errors: str = "strict"): 

240 f = lambda x: x.encode(encoding, errors=errors) 

241 return self._str_map(f, dtype=object) 

242 

243 def _str_find(self, sub, start: int = 0, end=None): 

244 return self._str_find_(sub, start, end, side="left") 

245 

246 def _str_rfind(self, sub, start: int = 0, end=None): 

247 return self._str_find_(sub, start, end, side="right") 

248 

249 def _str_find_(self, sub, start, end, side): 

250 if side == "left": 

251 method = "find" 

252 elif side == "right": 

253 method = "rfind" 

254 else: # pragma: no cover 

255 raise ValueError("Invalid side") 

256 

257 if end is None: 

258 f = lambda x: getattr(x, method)(sub, start) 

259 else: 

260 f = lambda x: getattr(x, method)(sub, start, end) 

261 return self._str_map(f, dtype="int64") 

262 

263 def _str_findall(self, pat, flags: int = 0): 

264 regex = re.compile(pat, flags=flags) 

265 return self._str_map(regex.findall, dtype="object") 

266 

267 def _str_get(self, i): 

268 def f(x): 

269 if isinstance(x, dict): 

270 return x.get(i) 

271 elif len(x) > i >= -len(x): 

272 return x[i] 

273 return self._str_na_value 

274 

275 return self._str_map(f) 

276 

277 def _str_index(self, sub, start: int = 0, end=None): 

278 if end: 

279 f = lambda x: x.index(sub, start, end) 

280 else: 

281 f = lambda x: x.index(sub, start, end) 

282 return self._str_map(f, dtype="int64") 

283 

284 def _str_rindex(self, sub, start: int = 0, end=None): 

285 if end: 

286 f = lambda x: x.rindex(sub, start, end) 

287 else: 

288 f = lambda x: x.rindex(sub, start, end) 

289 return self._str_map(f, dtype="int64") 

290 

291 def _str_join(self, sep: str): 

292 return self._str_map(sep.join) 

293 

294 def _str_partition(self, sep: str, expand): 

295 result = self._str_map(lambda x: x.partition(sep), dtype="object") 

296 return result 

297 

298 def _str_rpartition(self, sep: str, expand): 

299 return self._str_map(lambda x: x.rpartition(sep), dtype="object") 

300 

301 def _str_len(self): 

302 return self._str_map(len, dtype="int64") 

303 

304 def _str_slice(self, start=None, stop=None, step=None): 

305 obj = slice(start, stop, step) 

306 return self._str_map(lambda x: x[obj]) 

307 

308 def _str_slice_replace(self, start=None, stop=None, repl=None): 

309 if repl is None: 

310 repl = "" 

311 

312 def f(x): 

313 if x[start:stop] == "": 

314 local_stop = start 

315 else: 

316 local_stop = stop 

317 y = "" 

318 if start is not None: 

319 y += x[:start] 

320 y += repl 

321 if stop is not None: 

322 y += x[local_stop:] 

323 return y 

324 

325 return self._str_map(f) 

326 

327 def _str_split( 

328 self, 

329 pat: str | re.Pattern | None = None, 

330 n=-1, 

331 expand: bool = False, 

332 regex: bool | None = None, 

333 ): 

334 if pat is None: 

335 if n is None or n == 0: 

336 n = -1 

337 f = lambda x: x.split(pat, n) 

338 else: 

339 new_pat: str | re.Pattern 

340 if regex is True or isinstance(pat, re.Pattern): 

341 new_pat = re.compile(pat) 

342 elif regex is False: 

343 new_pat = pat 

344 # regex is None so link to old behavior #43563 

345 else: 

346 if len(pat) == 1: 

347 new_pat = pat 

348 else: 

349 new_pat = re.compile(pat) 

350 

351 if isinstance(new_pat, re.Pattern): 

352 if n is None or n == -1: 

353 n = 0 

354 f = lambda x: new_pat.split(x, maxsplit=n) 

355 else: 

356 if n is None or n == 0: 

357 n = -1 

358 f = lambda x: x.split(pat, n) 

359 return self._str_map(f, dtype=object) 

360 

361 def _str_rsplit(self, pat=None, n=-1): 

362 if n is None or n == 0: 

363 n = -1 

364 f = lambda x: x.rsplit(pat, n) 

365 return self._str_map(f, dtype="object") 

366 

367 def _str_translate(self, table): 

368 return self._str_map(lambda x: x.translate(table)) 

369 

370 def _str_wrap(self, width: int, **kwargs): 

371 kwargs["width"] = width 

372 tw = textwrap.TextWrapper(**kwargs) 

373 return self._str_map(lambda s: "\n".join(tw.wrap(s))) 

374 

375 def _str_get_dummies(self, sep: str = "|"): 

376 from pandas import Series 

377 

378 arr = Series(self).fillna("") 

379 try: 

380 arr = sep + arr + sep 

381 except (TypeError, NotImplementedError): 

382 arr = sep + arr.astype(str) + sep 

383 

384 tags: set[str] = set() 

385 for ts in Series(arr, copy=False).str.split(sep): 

386 tags.update(ts) 

387 tags2 = sorted(tags - {""}) 

388 

389 dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) 

390 

391 def _isin(test_elements: str, element: str) -> bool: 

392 return element in test_elements 

393 

394 for i, t in enumerate(tags2): 

395 pat = sep + t + sep 

396 dummies[:, i] = lib.map_infer( 

397 arr.to_numpy(), functools.partial(_isin, element=pat) 

398 ) 

399 return dummies, tags2 

400 

401 def _str_upper(self): 

402 return self._str_map(lambda x: x.upper()) 

403 

404 def _str_isalnum(self): 

405 return self._str_map(str.isalnum, dtype="bool") 

406 

407 def _str_isalpha(self): 

408 return self._str_map(str.isalpha, dtype="bool") 

409 

410 def _str_isdecimal(self): 

411 return self._str_map(str.isdecimal, dtype="bool") 

412 

413 def _str_isdigit(self): 

414 return self._str_map(str.isdigit, dtype="bool") 

415 

416 def _str_islower(self): 

417 return self._str_map(str.islower, dtype="bool") 

418 

419 def _str_isnumeric(self): 

420 return self._str_map(str.isnumeric, dtype="bool") 

421 

422 def _str_isspace(self): 

423 return self._str_map(str.isspace, dtype="bool") 

424 

425 def _str_istitle(self): 

426 return self._str_map(str.istitle, dtype="bool") 

427 

428 def _str_isupper(self): 

429 return self._str_map(str.isupper, dtype="bool") 

430 

431 def _str_capitalize(self): 

432 return self._str_map(str.capitalize) 

433 

434 def _str_casefold(self): 

435 return self._str_map(str.casefold) 

436 

437 def _str_title(self): 

438 return self._str_map(str.title) 

439 

440 def _str_swapcase(self): 

441 return self._str_map(str.swapcase) 

442 

443 def _str_lower(self): 

444 return self._str_map(str.lower) 

445 

446 def _str_normalize(self, form): 

447 f = lambda x: unicodedata.normalize(form, x) 

448 return self._str_map(f) 

449 

450 def _str_strip(self, to_strip=None): 

451 return self._str_map(lambda x: x.strip(to_strip)) 

452 

453 def _str_lstrip(self, to_strip=None): 

454 return self._str_map(lambda x: x.lstrip(to_strip)) 

455 

456 def _str_rstrip(self, to_strip=None): 

457 return self._str_map(lambda x: x.rstrip(to_strip)) 

458 

459 def _str_removeprefix(self, prefix: str) -> Series: 

460 # outstanding question on whether to use native methods for users on Python 3.9+ 

461 # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770, 

462 # in which case we could do return self._str_map(str.removeprefix) 

463 

464 def removeprefix(text: str) -> str: 

465 if text.startswith(prefix): 

466 return text[len(prefix) :] 

467 return text 

468 

469 return self._str_map(removeprefix) 

470 

471 def _str_removesuffix(self, suffix: str) -> Series: 

472 return self._str_map(lambda x: x.removesuffix(suffix)) 

473 

474 def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): 

475 regex = re.compile(pat, flags=flags) 

476 na_value = self._str_na_value 

477 

478 if not expand: 

479 

480 def g(x): 

481 m = regex.search(x) 

482 return m.groups()[0] if m else na_value 

483 

484 return self._str_map(g, convert=False) 

485 

486 empty_row = [na_value] * regex.groups 

487 

488 def f(x): 

489 if not isinstance(x, str): 

490 return empty_row 

491 m = regex.search(x) 

492 if m: 

493 return [na_value if item is None else item for item in m.groups()] 

494 else: 

495 return empty_row 

496 

497 return [f(val) for val in np.asarray(self)]