Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/string_.py: 26%

280 statements  

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    ClassVar,
    Literal,
)

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import ensure_string_array
from pandas.compat import pa_version_under10p1
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
    ExtensionDtype,
    StorageExtensionDtype,
    register_extension_dtype,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import (
    FloatingArray,
    FloatingDtype,
)
from pandas.core.arrays.integer import (
    IntegerArray,
    IntegerDtype,
)
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.missing import isna

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        AxisInt,
        Dtype,
        DtypeObj,
        NumpySorter,
        NumpyValueArrayLike,
        Scalar,
        Self,
        npt,
        type_t,
    )

    from pandas import Series


@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    # error: Cannot override instance variable (previously declared on
    # base class "StorageExtensionDtype") with class variable
    name: ClassVar[str] = "string"  # type: ignore[misc]

    #: StringDtype().na_value uses pandas.NA except the implementation that
    # follows NumPy semantics, which uses nan.
    @property
    def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
        if self.storage == "pyarrow_numpy":
            return np.nan
        else:
            return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        if storage is None:
            infer_string = get_option("future.infer_string")
            if infer_string:
                storage = "pyarrow_numpy"
            else:
                storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
            raise ValueError(
                f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
                f"Got {storage} instead."
            )
        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
            raise ImportError(
                "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        return str

    @classmethod
    def construct_from_string(cls, string) -> Self:
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        elif string == "string[pyarrow_numpy]":
            return cls(storage="pyarrow_numpy")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays.string_arrow import (
            ArrowStringArray,
            ArrowStringArrayNumpySemantics,
        )

        if self.storage == "python":
            return StringArray
        elif self.storage == "pyarrow":
            return ArrowStringArray
        else:
            return ArrowStringArrayNumpySemantics

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        elif self.storage == "pyarrow_numpy":
            from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

            return ArrowStringArrayNumpySemantics(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # convert chunk by chunk to numpy and concatenate them, to avoid
                # overflow for large string data when concatenating the pyarrow arrays
                arr = arr.to_numpy(zero_copy_only=False)
                arr = ensure_string_array(arr, na_value=libmissing.NA)
                results.append(arr)

            if len(chunks) == 0:
                arr = np.array([], dtype=object)
            else:
                arr = np.concatenate(results)

            # Bypass validation inside StringArray constructor, see GH#47781
            new_string_array = StringArray.__new__(StringArray)
            NDArrayBacked.__init__(
                new_string_array,
                arr,
                StringDtype(storage="python"),
            )
            return new_string_array

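# Illustrative only (not part of the pandas source): a sketch of how the
# ``storage`` option above selects the backing array class and NA sentinel.
# The pyarrow-backed variants assume pyarrow>=10.0.1 is installed.
#
# >>> import pandas as pd
# >>> pd.StringDtype("python").construct_array_type()
# <class 'pandas.core.arrays.string_.StringArray'>
# >>> pd.StringDtype("pyarrow").construct_array_type()
# <class 'pandas.core.arrays.string_arrow.ArrowStringArray'>
# >>> pd.StringDtype("pyarrow_numpy").na_value is pd.NA  # NumPy semantics use nan
# False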

class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        if self.ndim > 1:
            return [x.tolist() for x in self]
        return list(self.to_numpy())

    @classmethod
    def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
        if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]:
            # TODO: require any NAs be valid-for-string
            raise ValueError
        return cls._from_sequence(scalars, dtype=dtype)

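# Illustrative only (not part of the pandas source): ``tolist`` round-trips the
# array into a plain Python list, preserving missing entries as the dtype's NA
# value.
#
# >>> import pandas as pd
# >>> pd.array(["a", None, "b"], dtype="string").tolist()
# ['a', <NA>, 'b']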

# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes (``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <NumpyExtensionArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the NumpyExtensionArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if we need to convert NaN values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

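    # Illustrative only (not part of the pandas source): the constructor accepts an
    # object-dtype ndarray and, via ``_validate``/``convert_nans_to_NA`` above,
    # stores nan-likes as ``pd.NA``; any other ndarray dtype raises ValueError.
    #
    # >>> import numpy as np, pandas as pd
    # >>> pd.arrays.StringArray(np.array(["a", np.nan], dtype=object))
    # <StringArray>
    # ['a', <NA>]
    # Length: 2, dtype: string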

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if lib.is_pyarrow_array(scalars):
                # pyarrow array; we cannot rely on the "to_numpy" check in
                # ensure_string_array because calling scalars.to_numpy would set
                # zero_copy_only to True, which caused problems; see GH#52076
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating the new array avoids the validation step in __init__,
        # so it is faster. Refactor the need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        arr = self._ndarray.copy()
        mask = self.isna()
        arr[mask] = None
        return arr, None

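    # Illustrative only (not part of the pandas source): ``__arrow_array__`` is the
    # protocol hook that ``pyarrow.array`` uses for pandas extension arrays, so a
    # StringArray converts to a pyarrow string array with NA mapped to null
    # (assumes pyarrow is installed; the exact repr may vary by pyarrow version).
    #
    # >>> import pyarrow as pa, pandas as pd
    # >>> pa.array(pd.array(["a", None], dtype="string"))
    # <pyarrow.lib.StringArray object at ...>
    # [
    #   "a",
    #   null
    # ]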

    def __setitem__(self, key, value) -> None:
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract NumpyExtensionArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

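    # Illustrative only (not part of the pandas source): ``__setitem__`` above
    # coerces nan-likes to ``pd.NA`` and rejects non-string values.
    #
    # >>> import numpy as np, pandas as pd
    # >>> arr = pd.array(["a", "b"], dtype="string")
    # >>> arr[0] = np.nan  # stored as pd.NA
    # >>> arr[1] = 1
    # Traceback (most recent call last):
    # ...
    # TypeError: Cannot set non-string value '1' into a StringArray.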

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the NumpyExtensionArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

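    # Illustrative only (not part of the pandas source): ``astype`` to a masked
    # numeric dtype keeps missing values in the mask, while a plain NumPy float
    # dtype falls back to ``np.nan``.
    #
    # >>> import pandas as pd
    # >>> arr = pd.array(["1", None, "3"], dtype="string")
    # >>> arr.astype("Int64")
    # <IntegerArray>
    # [1, <NA>, 3]
    # Length: 3, dtype: Int64
    # >>> arr.astype(float)
    # array([ 1., nan,  3.])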

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        from pandas.core.algorithms import value_counts_internal as value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

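    # Illustrative only (not part of the pandas source): ``value_counts`` returns
    # nullable Int64 counts, with the index cast back to the string dtype.
    #
    # >>> import pandas as pd
    # >>> pd.array(["a", "a", None], dtype="string").value_counts().dtype
    # Int64Dtype()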

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

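    # Illustrative only (not part of the pandas source): comparisons propagate
    # missing values through a BooleanArray mask, while arithmetic binops
    # (e.g. ``+``) produce a new StringArray, as dispatched above.
    #
    # >>> import pandas as pd
    # >>> arr = pd.array(["a", None], dtype="string")
    # >>> arr == "a"
    # <BooleanArray>
    # [True, <NA>]
    # Length: 2, dtype: boolean
    # >>> arr + "!"
    # <StringArray>
    # ['a!', <NA>]
    # Length: 2, dtype: string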

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "NumpyExtensionArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray | BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            elif dtype == np.dtype("bool"):
                na_value = bool(na_value)
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))

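    # Illustrative only (not part of the pandas source): ``_str_map`` above is what
    # routes Series.str results to the matching array type: integer-returning
    # methods come back as Int64, boolean methods as boolean, string methods as
    # string, and anything else as object.
    #
    # >>> import pandas as pd
    # >>> s = pd.Series(["ab", None], dtype="string")
    # >>> s.str.len().dtype, s.str.startswith("a").dtype, s.str.upper().dtype
    # (Int64Dtype(), BooleanDtype(), string[python])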