Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/strings/object_array.py

1from __future__ import annotations

3import functools

4import re

5import textwrap

6from typing import (

7 TYPE_CHECKING,

8 Callable,

9 Literal,

10 cast,

11)

12import unicodedata

14import numpy as np

16from pandas._libs import lib

17import pandas._libs.missing as libmissing

18import pandas._libs.ops as libops

20from pandas.core.dtypes.missing import isna

22from pandas.core.strings.base import BaseStringArrayMethods

24if TYPE_CHECKING:

25 from collections.abc import Sequence

27 from pandas._typing import (

28 NpDtype,

29 Scalar,

30 )

32 from pandas import Series

class ObjectStringArrayMixin(BaseStringArrayMethods):
    """
    String Methods operating on object-dtype ndarrays.

    Almost every ``_str_*`` method builds a small per-element callable and
    hands it to :meth:`_str_map`, which applies it to the non-NA elements.
    """

    _str_na_value = np.nan

    def __len__(self) -> int:
        # For typing, _str_map relies on the object being sized.
        raise NotImplementedError

    def _str_map(
        self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
    ):
        """
        Map a callable over valid elements of the array.

        Parameters
        ----------
        f : Callable
            A function to call on each non-NA element.
        na_value : Scalar, optional
            The value to set for NA values. Might also be used for the
            fill value if the callable `f` raises an exception.
            This defaults to ``self._str_na_value`` which is ``np.nan``
            for object-dtype and Categorical and ``pd.NA`` for StringArray.
        dtype : Dtype, optional
            The dtype of the result array.
        convert : bool, default True
            Whether to call `maybe_convert_objects` on the resulting ndarray

        Returns
        -------
        np.ndarray
            The mapped values. If ``lib.map_infer_mask`` returns something
            that is not an ndarray, that object is returned unchanged.
        """
        if dtype is None:
            dtype = np.dtype("object")
        if na_value is None:
            na_value = self._str_na_value

        if not len(self):
            return np.array([], dtype=dtype)

        arr = np.asarray(self, dtype=object)
        mask = isna(arr)
        # Nothing to infer when every element is NA.
        map_convert = convert and not np.all(mask)
        try:
            result = lib.map_infer_mask(
                arr, f, mask.view(np.uint8), convert=map_convert
            )
        except (TypeError, AttributeError) as err:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if len(err.args) >= 1 and re.search(p_err, err.args[0]):
                # FIXME: this should be totally avoidable
                raise

            def g(x):
                # This type of fallback behavior can be removed once
                # we remove object-dtype .str accessor.
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return self._str_map(g, na_value=na_value, dtype=dtype)
        if not isinstance(result, np.ndarray):
            return result
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if convert and result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result

    def _str_count(self, pat, flags: int = 0):
        """Count the matches of regex `pat` found by ``findall`` per element."""
        regex = re.compile(pat, flags=flags)
        f = lambda x: len(regex.findall(x))
        return self._str_map(f, dtype="int64")

    def _str_pad(
        self,
        width: int,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        """
        Pad each element to `width` with `fillchar`.

        ``side="left"`` pads on the left (``rjust``), ``"right"`` pads on
        the right (``ljust``) and ``"both"`` centers the text.

        Raises
        ------
        ValueError
            If `side` is not one of the three accepted values.
        """
        if side == "left":
            f = lambda x: x.rjust(width, fillchar)
        elif side == "right":
            f = lambda x: x.ljust(width, fillchar)
        elif side == "both":
            f = lambda x: x.center(width, fillchar)
        else:  # pragma: no cover
            raise ValueError("Invalid side")
        return self._str_map(f)

    def _str_contains(
        self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
    ):
        """
        Test whether `pat` occurs in each element.

        With ``regex=True`` the pattern is compiled and searched for;
        otherwise a plain substring test is used, comparing upper-cased
        text when ``case`` is false.
        """
        if regex:
            if not case:
                flags |= re.IGNORECASE

            pat = re.compile(pat, flags=flags)

            f = lambda x: pat.search(x) is not None
        else:
            if case:
                f = lambda x: pat in x
            else:
                upper_pat = pat.upper()
                f = lambda x: upper_pat in x.upper()
        return self._str_map(f, na, dtype=np.dtype("bool"))

    def _str_startswith(self, pat, na=None):
        """Test whether each element starts with `pat`."""
        f = lambda x: x.startswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_endswith(self, pat, na=None):
        """Test whether each element ends with `pat`."""
        f = lambda x: x.endswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        """
        Replace occurrences of `pat` with `repl` in each element.

        The ``re`` machinery is used when `regex` is true, when any flag is
        set (including the one implied by ``case=False``) or when `repl` is
        callable; otherwise ``str.replace`` is used.
        """
        if case is False:
            # add case flag, if provided
            flags |= re.IGNORECASE

        if regex or flags or callable(repl):
            if not isinstance(pat, re.Pattern):
                if regex is False:
                    # Literal pattern routed through ``re``: escape it first.
                    pat = re.escape(pat)
                pat = re.compile(pat, flags=flags)

            # ``re.sub`` uses count=0 for "replace all", not a negative number.
            n = n if n >= 0 else 0
            f = lambda x: pat.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)

        return self._str_map(f, dtype=str)

    def _str_repeat(self, repeats: int | Sequence[int]):
        """
        Repeat each element `repeats` times.

        `repeats` is either a single integer applied to every element or a
        sequence giving one count per element.
        """
        if lib.is_integer(repeats):
            rint = cast(int, repeats)

            def scalar_rep(x):
                # Try bytes first, then fall back to str.
                try:
                    return bytes.__mul__(x, rint)
                except TypeError:
                    return str.__mul__(x, rint)

            return self._str_map(scalar_rep, dtype=str)
        else:
            from pandas.core.arrays.string_ import BaseStringArray

            def rep(x, r):
                if x is libmissing.NA:
                    return x
                try:
                    return bytes.__mul__(x, r)
                except TypeError:
                    return str.__mul__(x, r)

            result = libops.vec_binop(
                np.asarray(self),
                np.asarray(repeats, dtype=object),
                rep,
            )
            if isinstance(self, BaseStringArray):
                # Not going through map, so we have to do this here.
                result = type(self)._from_sequence(result, dtype=self.dtype)
            return result

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        """Test whether regex `pat` matches at the start of each element."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.match(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | None = None,
    ):
        """Test whether regex `pat` matches each element in its entirety."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.fullmatch(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_encode(self, encoding, errors: str = "strict"):
        """Encode each element with `encoding` using the given error mode."""
        f = lambda x: x.encode(encoding, errors=errors)
        return self._str_map(f, dtype=object)

    def _str_find(self, sub, start: int = 0, end=None):
        """Return the result of ``find(sub, start[, end])`` per element."""
        return self._str_find_(sub, start, end, side="left")

    def _str_rfind(self, sub, start: int = 0, end=None):
        """Return the result of ``rfind(sub, start[, end])`` per element."""
        return self._str_find_(sub, start, end, side="right")

    def _str_find_(self, sub, start, end, side):
        """
        Shared implementation of ``_str_find`` and ``_str_rfind``.

        `side` selects the element method: ``"left"`` uses ``find`` and
        ``"right"`` uses ``rfind``; anything else raises ``ValueError``.
        """
        if side == "left":
            method = "find"
        elif side == "right":
            method = "rfind"
        else:  # pragma: no cover
            raise ValueError("Invalid side")

        if end is None:
            f = lambda x: getattr(x, method)(sub, start)
        else:
            f = lambda x: getattr(x, method)(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_findall(self, pat, flags: int = 0):
        """Return ``re.findall`` results for regex `pat` per element."""
        regex = re.compile(pat, flags=flags)
        return self._str_map(regex.findall, dtype="object")

    def _str_get(self, i):
        """
        Extract item `i` from each element.

        Dict elements are queried with ``.get(i)``; other elements are
        indexed when `i` is within bounds, and yield ``self._str_na_value``
        when it is not.
        """

        def f(x):
            if isinstance(x, dict):
                return x.get(i)
            elif len(x) > i >= -len(x):
                return x[i]
            return self._str_na_value

        return self._str_map(f)

    def _str_index(self, sub, start: int = 0, end=None):
        """Return ``x.index(sub, start, end)`` for each element ``x``."""
        # `end` is forwarded as given, including ``None``.
        f = lambda x: x.index(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_rindex(self, sub, start: int = 0, end=None):
        """Return ``x.rindex(sub, start, end)`` for each element ``x``."""
        # `end` is forwarded as given, including ``None``.
        f = lambda x: x.rindex(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_join(self, sep: str):
        """Join the items of each element using `sep`."""
        return self._str_map(sep.join)

    def _str_partition(self, sep: str, expand):
        """Apply ``partition(sep)`` to each element; `expand` is unused here."""
        result = self._str_map(lambda x: x.partition(sep), dtype="object")
        return result

    def _str_rpartition(self, sep: str, expand):
        """Apply ``rpartition(sep)`` to each element; `expand` is unused here."""
        return self._str_map(lambda x: x.rpartition(sep), dtype="object")

    def _str_len(self):
        """Return ``len`` of each element."""
        return self._str_map(len, dtype="int64")

    def _str_slice(self, start=None, stop=None, step=None):
        """Slice each element with ``slice(start, stop, step)``."""
        obj = slice(start, stop, step)
        return self._str_map(lambda x: x[obj])

    def _str_slice_replace(self, start=None, stop=None, repl=None):
        """
        Replace the ``[start:stop]`` slice of each element with `repl`.

        A `repl` of ``None`` is treated as the empty string.
        """
        if repl is None:
            repl = ""

        def f(x):
            # When the slice is empty, resume from `start` instead of `stop`.
            if x[start:stop] == "":
                local_stop = start
            else:
                local_stop = stop
            y = ""
            if start is not None:
                y += x[:start]
            y += repl
            if stop is not None:
                y += x[local_stop:]
            return y

        return self._str_map(f)

    def _str_split(
        self,
        pat: str | re.Pattern | None = None,
        n=-1,
        expand: bool = False,
        regex: bool | None = None,
    ):
        """
        Split each element around `pat`.

        A compiled pattern (or ``regex=True``) splits with ``re``;
        ``regex=False`` splits literally. With ``regex=None`` a
        one-character `pat` is literal and anything longer is compiled.
        `expand` is not used by this implementation.
        """
        if pat is None:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            new_pat: str | re.Pattern
            if regex is True or isinstance(pat, re.Pattern):
                new_pat = re.compile(pat)
            elif regex is False:
                new_pat = pat
            # regex is None so link to old behavior #43563
            else:
                if len(pat) == 1:
                    new_pat = pat
                else:
                    new_pat = re.compile(pat)

            if isinstance(new_pat, re.Pattern):
                # ``Pattern.split`` uses maxsplit=0 for "no limit".
                if n is None or n == -1:
                    n = 0
                f = lambda x: new_pat.split(x, maxsplit=n)
            else:
                # ``str.split`` uses -1 for "no limit".
                if n is None or n == 0:
                    n = -1
                f = lambda x: x.split(pat, n)
        return self._str_map(f, dtype=object)

    def _str_rsplit(self, pat=None, n=-1):
        """Apply ``rsplit(pat, n)`` per element; ``None``/``0`` mean no limit."""
        if n is None or n == 0:
            n = -1
        f = lambda x: x.rsplit(pat, n)
        return self._str_map(f, dtype="object")

    def _str_translate(self, table):
        """Apply ``translate(table)`` to each element."""
        return self._str_map(lambda x: x.translate(table))

    def _str_wrap(self, width: int, **kwargs):
        """
        Wrap each element with ``textwrap.TextWrapper``.

        `width` and the remaining keyword arguments configure the wrapper;
        the wrapped lines are joined with newlines.
        """
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        return self._str_map(lambda s: "\n".join(tw.wrap(s)))

    def _str_get_dummies(self, sep: str = "|"):
        """
        Build indicator columns from `sep`-delimited tags.

        Returns
        -------
        tuple
            ``(dummies, tags2)``: an ``int64`` array with one column per tag,
            and the sorted list of distinct non-empty tags.
        """
        from pandas import Series

        arr = Series(self).fillna("")
        # Surround each value with `sep` so every tag is delimited on both sides.
        try:
            arr = sep + arr + sep
        except (TypeError, NotImplementedError):
            arr = sep + arr.astype(str) + sep

        tags: set[str] = set()
        for ts in Series(arr, copy=False).str.split(sep):
            tags.update(ts)
        tags2 = sorted(tags - {""})

        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

        def _isin(test_elements: str, element: str) -> bool:
            return element in test_elements

        for i, t in enumerate(tags2):
            pat = sep + t + sep
            dummies[:, i] = lib.map_infer(
                arr.to_numpy(), functools.partial(_isin, element=pat)
            )
        return dummies, tags2

    def _str_upper(self):
        """Apply ``upper()`` to each element."""
        return self._str_map(lambda x: x.upper())

    def _str_isalnum(self):
        """Apply ``str.isalnum`` to each element."""
        return self._str_map(str.isalnum, dtype="bool")

    def _str_isalpha(self):
        """Apply ``str.isalpha`` to each element."""
        return self._str_map(str.isalpha, dtype="bool")

    def _str_isdecimal(self):
        """Apply ``str.isdecimal`` to each element."""
        return self._str_map(str.isdecimal, dtype="bool")

    def _str_isdigit(self):
        """Apply ``str.isdigit`` to each element."""
        return self._str_map(str.isdigit, dtype="bool")

    def _str_islower(self):
        """Apply ``str.islower`` to each element."""
        return self._str_map(str.islower, dtype="bool")

    def _str_isnumeric(self):
        """Apply ``str.isnumeric`` to each element."""
        return self._str_map(str.isnumeric, dtype="bool")

    def _str_isspace(self):
        """Apply ``str.isspace`` to each element."""
        return self._str_map(str.isspace, dtype="bool")

    def _str_istitle(self):
        """Apply ``str.istitle`` to each element."""
        return self._str_map(str.istitle, dtype="bool")

    def _str_isupper(self):
        """Apply ``str.isupper`` to each element."""
        return self._str_map(str.isupper, dtype="bool")

    def _str_capitalize(self):
        """Apply ``str.capitalize`` to each element."""
        return self._str_map(str.capitalize)

    def _str_casefold(self):
        """Apply ``str.casefold`` to each element."""
        return self._str_map(str.casefold)

    def _str_title(self):
        """Apply ``str.title`` to each element."""
        return self._str_map(str.title)

    def _str_swapcase(self):
        """Apply ``str.swapcase`` to each element."""
        return self._str_map(str.swapcase)

    def _str_lower(self):
        """Apply ``str.lower`` to each element."""
        return self._str_map(str.lower)

    def _str_normalize(self, form):
        """Apply ``unicodedata.normalize(form, x)`` to each element."""
        f = lambda x: unicodedata.normalize(form, x)
        return self._str_map(f)

    def _str_strip(self, to_strip=None):
        """Apply ``strip(to_strip)`` to each element."""
        return self._str_map(lambda x: x.strip(to_strip))

    def _str_lstrip(self, to_strip=None):
        """Apply ``lstrip(to_strip)`` to each element."""
        return self._str_map(lambda x: x.lstrip(to_strip))

    def _str_rstrip(self, to_strip=None):
        """Apply ``rstrip(to_strip)`` to each element."""
        return self._str_map(lambda x: x.rstrip(to_strip))

    def _str_removeprefix(self, prefix: str) -> Series:
        """Drop `prefix` from the start of each element that begins with it."""
        # outstanding question on whether to use native methods for users on Python 3.9+
        # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
        # in which case we could do return self._str_map(str.removeprefix)

        def removeprefix(text: str) -> str:
            if text.startswith(prefix):
                return text[len(prefix) :]
            return text

        return self._str_map(removeprefix)

    def _str_removesuffix(self, suffix: str) -> Series:
        """Apply ``removesuffix(suffix)`` to each element."""
        return self._str_map(lambda x: x.removesuffix(suffix))

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        """
        Extract the capture groups of the first match of `pat` per element.

        With ``expand=False`` only the first group is returned, through
        ``_str_map``. Otherwise a list with one row per element is returned,
        each row holding one entry per capture group; unmatched groups and
        non-matching or non-string elements yield ``self._str_na_value``.
        """
        regex = re.compile(pat, flags=flags)
        na_value = self._str_na_value

        if not expand:

            def g(x):
                m = regex.search(x)
                return m.groups()[0] if m else na_value

            return self._str_map(g, convert=False)

        # The same list object is reused for every row without a match.
        empty_row = [na_value] * regex.groups

        def f(x):
            if not isinstance(x, str):
                return empty_row
            m = regex.search(x)
            if m:
                return [na_value if item is None else item for item in m.groups()]
            else:
                return empty_row

        return [f(val) for val in np.asarray(self)]

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/strings/object_array.py: 22%

300 statements