Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/strings/object

1from __future__ import annotations

3import functools

4import re

5import sys

6import textwrap

7from typing import (

8 TYPE_CHECKING,

9 Callable,

10 Literal,

11)

12import unicodedata

14import numpy as np

16from pandas._libs import lib

17import pandas._libs.missing as libmissing

18import pandas._libs.ops as libops

19from pandas._typing import (

20 NpDtype,

21 Scalar,

22)

24from pandas.core.dtypes.common import is_scalar

25from pandas.core.dtypes.missing import isna

27from pandas.core.strings.base import BaseStringArrayMethods

29if TYPE_CHECKING:

30 from pandas import Series

class ObjectStringArrayMixin(BaseStringArrayMethods):
    """
    String Methods operating on object-dtype ndarrays.

    Every ``_str_*`` method builds a per-element callable and hands it to
    :meth:`_str_map`, which applies it to the non-NA elements of
    ``np.asarray(self, dtype=object)``.
    """

    _str_na_value = np.nan

    def __len__(self) -> int:
        # For typing, _str_map relies on the object being sized.
        raise NotImplementedError

    def _str_map(
        self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
    ):
        """
        Map a callable over valid elements of the array.

        Parameters
        ----------
        f : Callable
            A function to call on each non-NA element.
        na_value : Scalar, optional
            The value to set for NA values. Might also be used for the
            fill value if the callable `f` raises an exception.
            This defaults to ``self._str_na_value`` which is ``np.nan``
            for object-dtype and Categorical and ``pd.NA`` for StringArray.
        dtype : Dtype, optional
            The dtype of the result array.
        convert : bool, default True
            Whether to call `maybe_convert_objects` on the resulting ndarray

        Returns
        -------
        np.ndarray
            An empty array of ``dtype`` when ``self`` is empty; otherwise
            whatever ``lib.map_infer_mask`` produced, with NA slots filled
            with ``na_value`` when that is not ``np.nan``.

        Raises
        ------
        TypeError, AttributeError
            Re-raised only when the message indicates that `f` was called
            with the wrong number of positional arguments.
        """
        if dtype is None:
            dtype = np.dtype("object")
        if na_value is None:
            na_value = self._str_na_value

        if not len(self):
            return np.array([], dtype=dtype)

        arr = np.asarray(self, dtype=object)
        mask = isna(arr)
        # Skip inference when every element is NA: there is nothing to infer.
        map_convert = convert and not np.all(mask)
        try:
            # ``convert`` is passed by keyword so the call does not depend on
            # the positional layout of ``map_infer_mask``.
            result = lib.map_infer_mask(
                arr, f, mask.view(np.uint8), convert=map_convert
            )
        except (TypeError, AttributeError) as err:
            # Reraise the exception if callable `f` got wrong number of args.
            # The user may want to be warned by this, instead of getting NaN
            p_err = (
                r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
                r"(?(3)required )positional arguments?"
            )

            if (
                len(err.args) >= 1
                and isinstance(err.args[0], str)
                and re.search(p_err, err.args[0])
            ):
                # FIXME: this should be totally avoidable
                raise

            def g(x):
                # This type of fallback behavior can be removed once
                # we remove object-dtype .str accessor.
                try:
                    return f(x)
                except (TypeError, AttributeError):
                    return na_value

            return self._str_map(g, na_value=na_value, dtype=dtype)
        if not isinstance(result, np.ndarray):
            return result
        if na_value is not np.nan:
            np.putmask(result, mask, na_value)
            if convert and result.dtype == object:
                result = lib.maybe_convert_objects(result)
        return result

    def _str_count(self, pat, flags: int = 0):
        """Count the ``re.findall`` matches of ``pat`` in each element."""
        regex = re.compile(pat, flags=flags)
        f = lambda x: len(regex.findall(x))
        return self._str_map(f, dtype="int64")

    def _str_pad(
        self,
        width,
        side: Literal["left", "right", "both"] = "left",
        fillchar: str = " ",
    ):
        """
        Pad each element to ``width`` with ``fillchar``.

        ``side="left"`` pads on the left (``rjust``), ``"right"`` pads on the
        right (``ljust``) and ``"both"`` centers the element.
        """
        if side == "left":
            f = lambda x: x.rjust(width, fillchar)
        elif side == "right":
            f = lambda x: x.ljust(width, fillchar)
        elif side == "both":
            f = lambda x: x.center(width, fillchar)
        else:  # pragma: no cover
            raise ValueError("Invalid side")
        return self._str_map(f)

    def _str_contains(
        self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
    ):
        """
        Test whether ``pat`` occurs in each element.

        With ``regex=True`` the pattern is compiled (adding ``re.IGNORECASE``
        when ``case`` is false) and searched for; otherwise a plain substring
        test is used, upper-casing both sides when ``case`` is false.
        """
        if regex:
            if not case:
                flags |= re.IGNORECASE

            pat = re.compile(pat, flags=flags)

            f = lambda x: pat.search(x) is not None
        else:
            if case:
                f = lambda x: pat in x
            else:
                upper_pat = pat.upper()
                f = lambda x: upper_pat in x.upper()
        return self._str_map(f, na, dtype=np.dtype("bool"))

    def _str_startswith(self, pat, na=None):
        """Test whether each element starts with ``pat``."""
        f = lambda x: x.startswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_endswith(self, pat, na=None):
        """Test whether each element ends with ``pat``."""
        f = lambda x: x.endswith(pat)
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_replace(
        self,
        pat: str | re.Pattern,
        repl: str | Callable,
        n: int = -1,
        case: bool = True,
        flags: int = 0,
        regex: bool = True,
    ):
        """
        Replace occurrences of ``pat`` with ``repl`` in each element.

        The ``re`` path is taken when ``regex`` is true, any flag is set, or
        ``repl`` is callable; a literal ``pat`` is escaped first when
        ``regex`` is False. Otherwise ``str.replace`` is used.
        """
        if case is False:
            # add case flag, if provided
            flags |= re.IGNORECASE

        if regex or flags or callable(repl):
            if not isinstance(pat, re.Pattern):
                if regex is False:
                    pat = re.escape(pat)
                pat = re.compile(pat, flags=flags)

            # ``Pattern.sub`` uses count=0 for "replace all".
            n = n if n >= 0 else 0
            f = lambda x: pat.sub(repl=repl, string=x, count=n)
        else:
            f = lambda x: x.replace(pat, repl, n)

        return self._str_map(f, dtype=str)

    def _str_repeat(self, repeats):
        """
        Repeat each element ``repeats`` times.

        ``repeats`` may be a scalar (applied to every element) or a sequence
        paired element-wise with ``self``.
        """
        if is_scalar(repeats):

            def scalar_rep(x):
                # Try bytes first; ``bytes.__mul__`` raises TypeError for str.
                try:
                    return bytes.__mul__(x, repeats)
                except TypeError:
                    return str.__mul__(x, repeats)

            return self._str_map(scalar_rep, dtype=str)
        else:
            from pandas.core.arrays.string_ import BaseStringArray

            def rep(x, r):
                if x is libmissing.NA:
                    return x
                try:
                    return bytes.__mul__(x, r)
                except TypeError:
                    return str.__mul__(x, r)

            repeats = np.asarray(repeats, dtype=object)
            result = libops.vec_binop(np.asarray(self), repeats, rep)
            if isinstance(self, BaseStringArray):
                # Not going through map, so we have to do this here.
                result = type(self)._from_sequence(result)
            return result

    def _str_match(
        self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
    ):
        """Test whether ``pat`` matches at the start of each element."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.match(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_fullmatch(
        self,
        pat: str | re.Pattern,
        case: bool = True,
        flags: int = 0,
        na: Scalar | None = None,
    ):
        """Test whether ``pat`` matches the whole of each element."""
        if not case:
            flags |= re.IGNORECASE

        regex = re.compile(pat, flags=flags)

        f = lambda x: regex.fullmatch(x) is not None
        return self._str_map(f, na_value=na, dtype=np.dtype(bool))

    def _str_encode(self, encoding, errors: str = "strict"):
        """Encode each element with ``encoding``."""
        f = lambda x: x.encode(encoding, errors=errors)
        return self._str_map(f, dtype=object)

    def _str_find(self, sub, start: int = 0, end=None):
        """Lowest index of ``sub`` in each element (``-1`` if absent)."""
        return self._str_find_(sub, start, end, side="left")

    def _str_rfind(self, sub, start: int = 0, end=None):
        """Highest index of ``sub`` in each element (``-1`` if absent)."""
        return self._str_find_(sub, start, end, side="right")

    def _str_find_(self, sub, start, end, side):
        """Shared implementation of ``_str_find`` / ``_str_rfind``."""
        if side == "left":
            method = "find"
        elif side == "right":
            method = "rfind"
        else:  # pragma: no cover
            raise ValueError("Invalid side")

        if end is None:
            f = lambda x: getattr(x, method)(sub, start)
        else:
            f = lambda x: getattr(x, method)(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_findall(self, pat, flags: int = 0):
        """Return ``re.findall`` results for ``pat`` on each element."""
        regex = re.compile(pat, flags=flags)
        return self._str_map(regex.findall, dtype="object")

    def _str_get(self, i):
        """
        Extract item ``i`` from each element.

        Dict elements use ``dict.get(i)``; other elements are indexed when
        ``i`` is within bounds and yield ``self._str_na_value`` otherwise.
        """

        def f(x):
            if isinstance(x, dict):
                return x.get(i)
            elif len(x) > i >= -len(x):
                return x[i]
            return self._str_na_value

        return self._str_map(f)

    def _str_index(self, sub, start: int = 0, end=None):
        """Like ``_str_find`` but the element's ``index`` raises if absent."""
        # ``index`` accepts ``end=None``, so no branching on ``end`` is needed.
        f = lambda x: x.index(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_rindex(self, sub, start: int = 0, end=None):
        """Like ``_str_rfind`` but the element's ``rindex`` raises if absent."""
        # ``rindex`` accepts ``end=None``, so no branching on ``end`` is needed.
        f = lambda x: x.rindex(sub, start, end)
        return self._str_map(f, dtype="int64")

    def _str_join(self, sep):
        """Join the items of each element with ``sep``."""
        return self._str_map(sep.join)

    def _str_partition(self, sep, expand):
        """Split each element at the first ``sep``; ``expand`` is unused here."""
        return self._str_map(lambda x: x.partition(sep), dtype="object")

    def _str_rpartition(self, sep, expand):
        """Split each element at the last ``sep``; ``expand`` is unused here."""
        return self._str_map(lambda x: x.rpartition(sep), dtype="object")

    def _str_len(self):
        """Length of each element."""
        return self._str_map(len, dtype="int64")

    def _str_slice(self, start=None, stop=None, step=None):
        """Slice each element with ``slice(start, stop, step)``."""
        obj = slice(start, stop, step)
        return self._str_map(lambda x: x[obj])

    def _str_slice_replace(self, start=None, stop=None, repl=None):
        """
        Replace the ``[start:stop]`` slice of each element with ``repl``.

        ``repl=None`` is treated as the empty string.
        """
        if repl is None:
            repl = ""

        def f(x):
            # When the slice selects nothing, resume the tail at ``start``
            # so that no characters are dropped.
            if x[start:stop] == "":
                local_stop = start
            else:
                local_stop = stop
            y = ""
            if start is not None:
                y += x[:start]
            y += repl
            if stop is not None:
                y += x[local_stop:]
            return y

        return self._str_map(f)

    def _str_split(
        self,
        pat: str | re.Pattern | None = None,
        n=-1,
        expand: bool = False,
        regex: bool | None = None,
    ):
        """
        Split each element around ``pat``.

        ``pat=None`` splits on whitespace. A compiled pattern or
        ``regex=True`` uses ``re``; ``regex=False`` uses ``str.split``; with
        ``regex=None`` a single-character ``pat`` is literal and anything
        longer is compiled as a regular expression. ``expand`` is unused here.
        """
        if pat is None:
            if n is None or n == 0:
                n = -1
            f = lambda x: x.split(pat, n)
        else:
            new_pat: str | re.Pattern
            if regex is True or isinstance(pat, re.Pattern):
                new_pat = re.compile(pat)
            elif regex is False:
                new_pat = pat
            # regex is None so link to old behavior #43563
            else:
                if len(pat) == 1:
                    new_pat = pat
                else:
                    new_pat = re.compile(pat)

            if isinstance(new_pat, re.Pattern):
                # ``Pattern.split`` uses maxsplit=0 for "no limit".
                if n is None or n == -1:
                    n = 0
                f = lambda x: new_pat.split(x, maxsplit=n)
            else:
                # ``str.split`` uses maxsplit=-1 for "no limit".
                if n is None or n == 0:
                    n = -1
                f = lambda x: x.split(pat, n)
        return self._str_map(f, dtype=object)

    def _str_rsplit(self, pat=None, n=-1):
        """Split each element around ``pat`` starting from the right."""
        if n is None or n == 0:
            n = -1
        f = lambda x: x.rsplit(pat, n)
        return self._str_map(f, dtype="object")

    def _str_translate(self, table):
        """Apply ``translate(table)`` to each element."""
        return self._str_map(lambda x: x.translate(table))

    def _str_wrap(self, width, **kwargs):
        """Wrap each element with ``textwrap.TextWrapper``, joined by newlines."""
        kwargs["width"] = width
        tw = textwrap.TextWrapper(**kwargs)
        return self._str_map(lambda s: "\n".join(tw.wrap(s)))

    def _str_get_dummies(self, sep: str = "|"):
        """
        Build indicator columns for the ``sep``-separated tags of each element.

        Returns
        -------
        tuple
            ``(dummies, tags)`` where ``tags`` is the sorted list of distinct
            non-empty tags and ``dummies`` is an int64 array of shape
            ``(len(self), len(tags))``.
        """
        from pandas import Series

        arr = Series(self).fillna("")
        # Surround every element with ``sep`` so that each tag can be located
        # by searching for ``sep + tag + sep``.
        try:
            arr = sep + arr + sep
        except (TypeError, NotImplementedError):
            arr = sep + arr.astype(str) + sep

        tags: set[str] = set()
        for ts in Series(arr, copy=False).str.split(sep):
            tags.update(ts)
        tags2 = sorted(tags - {""})

        dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)

        def _isin(test_elements: str, element: str) -> bool:
            return element in test_elements

        for i, t in enumerate(tags2):
            pat = sep + t + sep
            dummies[:, i] = lib.map_infer(
                arr.to_numpy(), functools.partial(_isin, element=pat)
            )
        return dummies, tags2

    def _str_upper(self):
        """Upper-case each element."""
        return self._str_map(lambda x: x.upper())

    def _str_isalnum(self):
        """Apply ``str.isalnum`` to each element."""
        return self._str_map(str.isalnum, dtype="bool")

    def _str_isalpha(self):
        """Apply ``str.isalpha`` to each element."""
        return self._str_map(str.isalpha, dtype="bool")

    def _str_isdecimal(self):
        """Apply ``str.isdecimal`` to each element."""
        return self._str_map(str.isdecimal, dtype="bool")

    def _str_isdigit(self):
        """Apply ``str.isdigit`` to each element."""
        return self._str_map(str.isdigit, dtype="bool")

    def _str_islower(self):
        """Apply ``str.islower`` to each element."""
        return self._str_map(str.islower, dtype="bool")

    def _str_isnumeric(self):
        """Apply ``str.isnumeric`` to each element."""
        return self._str_map(str.isnumeric, dtype="bool")

    def _str_isspace(self):
        """Apply ``str.isspace`` to each element."""
        return self._str_map(str.isspace, dtype="bool")

    def _str_istitle(self):
        """Apply ``str.istitle`` to each element."""
        return self._str_map(str.istitle, dtype="bool")

    def _str_isupper(self):
        """Apply ``str.isupper`` to each element."""
        return self._str_map(str.isupper, dtype="bool")

    def _str_capitalize(self):
        """Apply ``str.capitalize`` to each element."""
        return self._str_map(str.capitalize)

    def _str_casefold(self):
        """Apply ``str.casefold`` to each element."""
        return self._str_map(str.casefold)

    def _str_title(self):
        """Apply ``str.title`` to each element."""
        return self._str_map(str.title)

    def _str_swapcase(self):
        """Apply ``str.swapcase`` to each element."""
        return self._str_map(str.swapcase)

    def _str_lower(self):
        """Apply ``str.lower`` to each element."""
        return self._str_map(str.lower)

    def _str_normalize(self, form):
        """Apply ``unicodedata.normalize(form, ...)`` to each element."""
        f = lambda x: unicodedata.normalize(form, x)
        return self._str_map(f)

    def _str_strip(self, to_strip=None):
        """Strip ``to_strip`` from both ends of each element."""
        return self._str_map(lambda x: x.strip(to_strip))

    def _str_lstrip(self, to_strip=None):
        """Strip ``to_strip`` from the left end of each element."""
        return self._str_map(lambda x: x.lstrip(to_strip))

    def _str_rstrip(self, to_strip=None):
        """Strip ``to_strip`` from the right end of each element."""
        return self._str_map(lambda x: x.rstrip(to_strip))

    def _str_removeprefix(self, prefix: str) -> Series:
        """Remove ``prefix`` from each element that starts with it."""
        # outstanding question on whether to use native methods for users on Python 3.9+
        # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
        # in which case we could do return self._str_map(str.removeprefix)

        def removeprefix(text: str) -> str:
            if text.startswith(prefix):
                return text[len(prefix) :]
            return text

        return self._str_map(removeprefix)

    def _str_removesuffix(self, suffix: str) -> Series:
        """Remove ``suffix`` from each element that ends with it."""
        if sys.version_info < (3, 9):
            # NOTE pyupgrade will remove this when we run it with --py39-plus
            # so don't remove the unnecessary `else` statement below
            from pandas.util._str_methods import removesuffix

            return self._str_map(functools.partial(removesuffix, suffix=suffix))
        else:
            return self._str_map(lambda x: x.removesuffix(suffix))

    def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
        """
        Extract the capture groups of the first match of ``pat``.

        With ``expand=False`` only the first group is returned, through
        ``_str_map``. Otherwise a list with one row (list of groups) per
        element is returned; unmatched groups, non-matching elements and
        non-string elements are filled with ``self._str_na_value``.
        """
        regex = re.compile(pat, flags=flags)
        na_value = self._str_na_value

        if not expand:

            def g(x):
                m = regex.search(x)
                return m.groups()[0] if m else na_value

            return self._str_map(g, convert=False)

        group_count = regex.groups

        def f(x):
            if isinstance(x, str):
                m = regex.search(x)
                if m:
                    return [na_value if item is None else item for item in m.groups()]
            # Build a fresh row each time so rows never alias one another.
            return [na_value] * group_count

        return [f(val) for val in np.asarray(self)]

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/strings/object_array.py: 23%

304 statements