Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/strings/object_array.py: 23%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

304 statements  

1from __future__ import annotations 

2 

3import functools 

4import re 

5import sys 

6import textwrap 

7from typing import ( 

8 TYPE_CHECKING, 

9 Callable, 

10 Literal, 

11) 

12import unicodedata 

13 

14import numpy as np 

15 

16from pandas._libs import lib 

17import pandas._libs.missing as libmissing 

18import pandas._libs.ops as libops 

19from pandas._typing import ( 

20 NpDtype, 

21 Scalar, 

22) 

23 

24from pandas.core.dtypes.common import is_scalar 

25from pandas.core.dtypes.missing import isna 

26 

27from pandas.core.strings.base import BaseStringArrayMethods 

28 

29if TYPE_CHECKING: 

30 from pandas import Series 

31 

32 

class ObjectStringArrayMixin(BaseStringArrayMethods):
    """
    String Methods operating on object-dtype ndarrays.

    Most ``_str_*`` methods build a per-element callable and hand it to
    ``_str_map``, which applies it to the non-NA elements of the array.
    """

    _str_na_value = np.nan

    def __len__(self) -> int:
        # For typing, _str_map relies on the object being sized.
        raise NotImplementedError

    def _str_map(
        self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
    ):
        """
        Map a callable over valid elements of the array.

        Parameters
        ----------
        f : Callable
            A function to call on each non-NA element.
        na_value : Scalar, optional
            The value to set for NA values. Might also be used for the
            fill value if the callable `f` raises an exception.
            This defaults to ``self._str_na_value`` which is ``np.nan``
            for object-dtype and Categorical and ``pd.NA`` for StringArray.
        dtype : Dtype, optional
            The dtype of the result array.
        convert : bool, default True
            Whether to call `maybe_convert_objects` on the resulting ndarray

        Returns
        -------
        np.ndarray
            Empty array of ``dtype`` when ``self`` is empty, otherwise the
            mapped values (whatever ``lib.map_infer_mask`` produced is returned
            untouched if it is not an ndarray).

        Raises
        ------
        TypeError, AttributeError
            Re-raised only when the message indicates that `f` was called
            with the wrong number of positional arguments.
        """
        if dtype is None:
            dtype = np.dtype("object")
        if na_value is None:
            na_value = self._str_na_value

        if not len(self):
            return np.array([], dtype=dtype)

        arr = np.asarray(self, dtype=object)
        mask = isna(arr)
        # Nothing to infer from when every element is NA.
        map_convert = convert and not np.all(mask)
        try:
            # ``convert`` is passed by keyword so the call is valid whether
            # or not the Cython signature declares it keyword-only.
            result = lib.map_infer_mask(
                arr, f, mask.view(np.uint8), convert=map_convert
            )
        except (TypeError, AttributeError) as err:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            # Only string messages can be matched; anything else falls
            # through to the element-wise fallback below.
            if (
                len(err.args) >= 1
                and isinstance(err.args[0], str)
                and re.search(p_err, err.args[0])
            ):
                # FIXME: this should be totally avoidable
                raise

            def g(x):
                # This type of fallback behavior can be removed once
                # we remove object-dtype .str accessor.
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            # Forward ``convert`` so the fallback honours the caller's choice.
            return self._str_map(g, na_value=na_value, dtype=dtype, convert=convert)
        if not isinstance(result, np.ndarray):
            return result
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if convert and result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result

    def _str_count(self, pat, flags: int = 0):
        """Count the matches of regex ``pat`` in each element."""
        regex = re.compile(pat, flags=flags)
        f = lambda x: len(regex.findall(x))
        return self._str_map(f, dtype="int64")

    def _str_pad(
        self,
        width,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        """
        Pad each element to ``width`` with ``fillchar`` on the given ``side``.

        ``side="left"`` pads on the left (``rjust``), ``"right"`` pads on the
        right (``ljust``) and ``"both"`` centers the element.
        """
        if side == "left":
            f = lambda x: x.rjust(width, fillchar)
        elif side == "right":
            f = lambda x: x.ljust(width, fillchar)
        elif side == "both":
            f = lambda x: x.center(width, fillchar)
        else:  # pragma: no cover
            raise ValueError("Invalid side")
        return self._str_map(f)

    def _str_contains(
        self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
    ):
        """
        Test whether ``pat`` occurs in each element.

        With ``regex=True`` the pattern is compiled (adding ``re.IGNORECASE``
        when ``case`` is False) and searched for; otherwise a plain substring
        test is used, upper-casing both sides when ``case`` is False.
        """
        if regex:
            if not case:
                flags |= re.IGNORECASE

            pat = re.compile(pat, flags=flags)

            f = lambda x: pat.search(x) is not None
        else:
            if case:
                f = lambda x: pat in x
            else:
                upper_pat = pat.upper()
                f = lambda x: upper_pat in x.upper()
        return self._str_map(f, na, dtype=np.dtype("bool"))

    def _str_startswith(self, pat, na=None):
        """Test whether each element starts with ``pat``."""
        f = lambda x: x.startswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_endswith(self, pat, na=None):
        """Test whether each element ends with ``pat``."""
        f = lambda x: x.endswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        """
        Replace occurrences of ``pat`` in each element with ``repl``.

        The ``re`` module is used when ``regex`` is true, when any flag is set
        (including the one implied by ``case=False``) or when ``repl`` is
        callable; otherwise ``str.replace`` is used.
        """
        if case is False:
            # add case flag, if provided
            flags |= re.IGNORECASE

        if regex or flags or callable(repl):
            if not isinstance(pat, re.Pattern):
                if regex is False:
                    pat = re.escape(pat)
                pat = re.compile(pat, flags=flags)

            # A negative ``n`` becomes ``count=0`` for ``Pattern.sub``.
            n = n if n >= 0 else 0
            f = lambda x: pat.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)

        return self._str_map(f, dtype=str)

    def _str_repeat(self, repeats):
        """
        Repeat each element ``repeats`` times.

        ``repeats`` is either a scalar applied to every element or a sequence
        paired element-wise with the array. Each element is first multiplied
        as ``bytes`` and, on ``TypeError``, as ``str``.
        """
        if is_scalar(repeats):

            def scalar_rep(x):
                try:
                    return bytes.__mul__(x, repeats)
                except TypeError:
                    return str.__mul__(x, repeats)

            return self._str_map(scalar_rep, dtype=str)
        else:
            from pandas.core.arrays.string_ import BaseStringArray

            def rep(x, r):
                if x is libmissing.NA:
                    return x
                try:
                    return bytes.__mul__(x, r)
                except TypeError:
                    return str.__mul__(x, r)

            repeats = np.asarray(repeats, dtype=object)
            result = libops.vec_binop(np.asarray(self), repeats, rep)
            if isinstance(self, BaseStringArray):
                # Not going through map, so we have to do this here.
                result = type(self)._from_sequence(result)
            return result

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        """Test whether each element matches regex ``pat`` at its start."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.match(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | None = None,
    ):
        """Test whether each element matches regex ``pat`` in its entirety."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.fullmatch(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_encode(self, encoding, errors: str = "strict"):
        """Encode each element with ``encoding``."""
        f = lambda x: x.encode(encoding, errors=errors)
        return self._str_map(f, dtype=object)

    def _str_find(self, sub, start: int = 0, end=None):
        """Lowest index of ``sub`` in each element (via ``find``)."""
        return self._str_find_(sub, start, end, side="left")

    def _str_rfind(self, sub, start: int = 0, end=None):
        """Highest index of ``sub`` in each element (via ``rfind``)."""
        return self._str_find_(sub, start, end, side="right")

    def _str_find_(self, sub, start, end, side):
        """Shared implementation of ``_str_find`` and ``_str_rfind``."""
        if side == "left":
            method = "find"
        elif side == "right":
            method = "rfind"
        else:  # pragma: no cover
            raise ValueError("Invalid side")

        if end is None:
            f = lambda x: getattr(x, method)(sub, start)
        else:
            f = lambda x: getattr(x, method)(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_findall(self, pat, flags: int = 0):
        """Return ``regex.findall`` of ``pat`` for each element."""
        regex = re.compile(pat, flags=flags)
        return self._str_map(regex.findall, dtype="object")

    def _str_get(self, i):
        """
        Extract item ``i`` from each element.

        Dict elements are queried with ``.get(i)``; other elements are indexed
        when ``i`` is within bounds and yield ``self._str_na_value`` otherwise.
        """

        def f(x):
            if isinstance(x, dict):
                return x.get(i)
            elif len(x) > i >= -len(x):
                return x[i]
            return self._str_na_value

        return self._str_map(f)

    def _str_index(self, sub, start: int = 0, end=None):
        """Call ``x.index(sub, start, end)`` on each element."""
        # ``end=None`` is accepted by ``index`` itself, so no branching on
        # ``end`` is needed.
        f = lambda x: x.index(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_rindex(self, sub, start: int = 0, end=None):
        """Call ``x.rindex(sub, start, end)`` on each element."""
        # ``end=None`` is accepted by ``rindex`` itself, so no branching on
        # ``end`` is needed.
        f = lambda x: x.rindex(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_join(self, sep):
        """Join the items of each element with ``sep``."""
        return self._str_map(sep.join)

    def _str_partition(self, sep, expand):
        """Apply ``x.partition(sep)`` to each element (``expand`` is unused here)."""
        return self._str_map(lambda x: x.partition(sep), dtype="object")

    def _str_rpartition(self, sep, expand):
        """Apply ``x.rpartition(sep)`` to each element (``expand`` is unused here)."""
        return self._str_map(lambda x: x.rpartition(sep), dtype="object")

    def _str_len(self):
        """Length of each element."""
        return self._str_map(len, dtype="int64")

    def _str_slice(self, start=None, stop=None, step=None):
        """Slice each element with ``slice(start, stop, step)``."""
        obj = slice(start, stop, step)
        return self._str_map(lambda x: x[obj])

    def _str_slice_replace(self, start=None, stop=None, repl=None):
        """
        Replace the ``[start:stop]`` slice of each element with ``repl``.

        ``repl=None`` is treated as the empty string.
        """
        if repl is None:
            repl = ""

        def f(x):
            # When the slice is empty, resume right at ``start`` so nothing
            # is dropped from the element.
            if x[start:stop] == "":
                local_stop = start
            else:
                local_stop = stop
            y = ""
            if start is not None:
                y += x[:start]
            y += repl
            if stop is not None:
                y += x[local_stop:]
            return y

        return self._str_map(f)

    def _str_split(
        self,
        pat: str | re.Pattern | None = None,
        n=-1,
        expand: bool = False,
        regex: bool | None = None,
    ):
        """
        Split each element around ``pat``.

        ``pat=None`` uses ``str.split`` with its default separator. Otherwise
        the pattern is compiled when ``regex`` is True or ``pat`` is already a
        compiled pattern, used literally when ``regex`` is False, and, when
        ``regex`` is None, used literally only if it is a single character.
        ``n`` of ``None``, ``0`` or ``-1`` means "no limit" (``expand`` is
        unused here).
        """
        if pat is None:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            new_pat: str | re.Pattern
            if regex is True or isinstance(pat, re.Pattern):
                new_pat = re.compile(pat)
            elif regex is False:
                new_pat = pat
            # regex is None so link to old behavior #43563
            else:
                if len(pat) == 1:
                    new_pat = pat
                else:
                    new_pat = re.compile(pat)

            if isinstance(new_pat, re.Pattern):
                # ``Pattern.split`` spells "no limit" as ``maxsplit=0``.
                if n is None or n == -1:
                    n = 0
                f = lambda x: new_pat.split(x, maxsplit=n)
            else:
                # ``str.split`` spells "no limit" as ``-1``.
                if n is None or n == 0:
                    n = -1
                f = lambda x: x.split(pat, n)
        return self._str_map(f, dtype=object)

    def _str_rsplit(self, pat=None, n=-1):
        """Split each element from the right with ``x.rsplit(pat, n)``."""
        if n is None or n == 0:
            n = -1
        f = lambda x: x.rsplit(pat, n)
        return self._str_map(f, dtype="object")

    def _str_translate(self, table):
        """Apply ``x.translate(table)`` to each element."""
        return self._str_map(lambda x: x.translate(table))

    def _str_wrap(self, width, **kwargs):
        """
        Wrap each element with a ``textwrap.TextWrapper`` of the given ``width``.

        Extra keyword arguments are forwarded to ``TextWrapper``; the wrapped
        lines are joined with newlines.
        """
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        return self._str_map(lambda s: "\n".join(tw.wrap(s)))

    def _str_get_dummies(self, sep: str = "|"):
        """
        Build an indicator matrix from ``sep``-delimited tags.

        Returns
        -------
        tuple of (np.ndarray, list)
            An int64 matrix with one row per element and one column per tag,
            and the sorted list of tags labelling the columns.
        """
        from pandas import Series

        arr = Series(self).fillna("")
        try:
            arr = sep + arr + sep
        except (TypeError, NotImplementedError):
            arr = sep + arr.astype(str) + sep

        tags: set[str] = set()
        for ts in Series(arr, copy=False).str.split(sep):
            tags.update(ts)
        tags2 = sorted(tags - {""})

        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

        def _isin(test_elements: str, element: str) -> bool:
            return element in test_elements

        # Loop-invariant: materialize the ndarray once rather than per tag.
        values = arr.to_numpy()
        for i, t in enumerate(tags2):
            # Each element was wrapped in ``sep`` above, so searching for
            # ``sep + tag + sep`` only matches whole tags.
            pat = sep + t + sep
            dummies[:, i] = lib.map_infer(
                values, functools.partial(_isin, element=pat)
            )
        return dummies, tags2

    def _str_upper(self):
        """Upper-case each element."""
        return self._str_map(lambda x: x.upper())

    def _str_isalnum(self):
        """Apply ``str.isalnum`` to each element."""
        return self._str_map(str.isalnum, dtype="bool")

    def _str_isalpha(self):
        """Apply ``str.isalpha`` to each element."""
        return self._str_map(str.isalpha, dtype="bool")

    def _str_isdecimal(self):
        """Apply ``str.isdecimal`` to each element."""
        return self._str_map(str.isdecimal, dtype="bool")

    def _str_isdigit(self):
        """Apply ``str.isdigit`` to each element."""
        return self._str_map(str.isdigit, dtype="bool")

    def _str_islower(self):
        """Apply ``str.islower`` to each element."""
        return self._str_map(str.islower, dtype="bool")

    def _str_isnumeric(self):
        """Apply ``str.isnumeric`` to each element."""
        return self._str_map(str.isnumeric, dtype="bool")

    def _str_isspace(self):
        """Apply ``str.isspace`` to each element."""
        return self._str_map(str.isspace, dtype="bool")

    def _str_istitle(self):
        """Apply ``str.istitle`` to each element."""
        return self._str_map(str.istitle, dtype="bool")

    def _str_isupper(self):
        """Apply ``str.isupper`` to each element."""
        return self._str_map(str.isupper, dtype="bool")

    def _str_capitalize(self):
        """Apply ``str.capitalize`` to each element."""
        return self._str_map(str.capitalize)

    def _str_casefold(self):
        """Apply ``str.casefold`` to each element."""
        return self._str_map(str.casefold)

    def _str_title(self):
        """Apply ``str.title`` to each element."""
        return self._str_map(str.title)

    def _str_swapcase(self):
        """Apply ``str.swapcase`` to each element."""
        return self._str_map(str.swapcase)

    def _str_lower(self):
        """Apply ``str.lower`` to each element."""
        return self._str_map(str.lower)

    def _str_normalize(self, form):
        """Apply ``unicodedata.normalize(form, x)`` to each element."""
        f = lambda x: unicodedata.normalize(form, x)
        return self._str_map(f)

    def _str_strip(self, to_strip=None):
        """Apply ``x.strip(to_strip)`` to each element."""
        return self._str_map(lambda x: x.strip(to_strip))

    def _str_lstrip(self, to_strip=None):
        """Apply ``x.lstrip(to_strip)`` to each element."""
        return self._str_map(lambda x: x.lstrip(to_strip))

    def _str_rstrip(self, to_strip=None):
        """Apply ``x.rstrip(to_strip)`` to each element."""
        return self._str_map(lambda x: x.rstrip(to_strip))

    def _str_removeprefix(self, prefix: str) -> Series:
        """Drop ``prefix`` from the start of each element that begins with it."""
        # outstanding question on whether to use native methods for users on Python 3.9+
        # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
        # in which case we could do return self._str_map(str.removeprefix)

        def removeprefix(text: str) -> str:
            if text.startswith(prefix):
                return text[len(prefix) :]
            return text

        return self._str_map(removeprefix)

    def _str_removesuffix(self, suffix: str) -> Series:
        """Drop ``suffix`` from the end of each element that ends with it."""
        if sys.version_info < (3, 9):
            # NOTE pyupgrade will remove this when we run it with --py39-plus
            # so don't remove the unnecessary `else` statement below
            from pandas.util._str_methods import removesuffix

            return self._str_map(functools.partial(removesuffix, suffix=suffix))
        else:
            return self._str_map(lambda x: x.removesuffix(suffix))

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        """
        Extract the capture groups of the first match of ``pat``.

        Parameters
        ----------
        pat : str
            Regular expression with capture groups.
        flags : int, default 0
            Flags passed to ``re.compile``.
        expand : bool, default True
            If False, return the first group of each element via ``_str_map``.
            If True, return one list per element holding every group.

        Returns
        -------
        np.ndarray or list of list
            Groups that did not participate in the match, and elements that
            are not ``str`` or do not match, are filled with
            ``self._str_na_value``.
        """
        regex = re.compile(pat, flags=flags)
        na_value = self._str_na_value

        if not expand:

            def g(x):
                m = regex.search(x)
                return m.groups()[0] if m else na_value

            return self._str_map(g, convert=False)

        n_groups = regex.groups

        def f(x):
            m = regex.search(x) if isinstance(x, str) else None
            if m:
                return [na_value if item is None else item for item in m.groups()]
            # Build a fresh row every time so the returned rows never alias
            # one another.
            return [na_value] * n_groups

        return [f(val) for val in np.asarray(self)]