Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/string_.py: 25%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

257 statements  

1from __future__ import annotations 

2 

3from typing import ( 

4 TYPE_CHECKING, 

5 Literal, 

6) 

7 

8import numpy as np 

9 

10from pandas._config import get_option 

11 

12from pandas._libs import ( 

13 lib, 

14 missing as libmissing, 

15) 

16from pandas._libs.arrays import NDArrayBacked 

17from pandas._typing import ( 

18 AxisInt, 

19 Dtype, 

20 Scalar, 

21 npt, 

22 type_t, 

23) 

24from pandas.compat import pa_version_under7p0 

25from pandas.compat.numpy import function as nv 

26from pandas.util._decorators import doc 

27 

28from pandas.core.dtypes.base import ( 

29 ExtensionDtype, 

30 StorageExtensionDtype, 

31 register_extension_dtype, 

32) 

33from pandas.core.dtypes.common import ( 

34 is_array_like, 

35 is_bool_dtype, 

36 is_dtype_equal, 

37 is_integer_dtype, 

38 is_object_dtype, 

39 is_string_dtype, 

40 pandas_dtype, 

41) 

42 

43from pandas.core import ops 

44from pandas.core.array_algos import masked_reductions 

45from pandas.core.arrays import ( 

46 ExtensionArray, 

47 FloatingArray, 

48 IntegerArray, 

49) 

50from pandas.core.arrays.floating import FloatingDtype 

51from pandas.core.arrays.integer import IntegerDtype 

52from pandas.core.arrays.numpy_ import PandasArray 

53from pandas.core.construction import extract_array 

54from pandas.core.indexers import check_array_indexer 

55from pandas.core.missing import isna 

56 

57if TYPE_CHECKING: 

58 import pyarrow 

59 

60 from pandas._typing import ( 

61 NumpySorter, 

62 NumpyValueArrayLike, 

63 ) 

64 

65 from pandas import Series 

66 

67 

68@register_extension_dtype 

69class StringDtype(StorageExtensionDtype): 

70 """ 

71 Extension dtype for string data. 

72 

73 .. warning:: 

74 

75 StringDtype is considered experimental. The implementation and 

76 parts of the API may change without warning. 

77 

78 Parameters 

79 ---------- 

80 storage : {"python", "pyarrow"}, optional 

81 If not given, the value of ``pd.options.mode.string_storage``. 

82 

83 Attributes 

84 ---------- 

85 None 

86 

87 Methods 

88 ------- 

89 None 

90 

91 Examples 

92 -------- 

93 >>> pd.StringDtype() 

94 string[python] 

95 

96 >>> pd.StringDtype(storage="pyarrow") 

97 string[pyarrow] 

98 """ 

99 

100 name = "string" 

101 

102 #: StringDtype().na_value uses pandas.NA 

103 @property 

104 def na_value(self) -> libmissing.NAType: 

105 return libmissing.NA 

106 

107 _metadata = ("storage",) 

108 

109 def __init__(self, storage=None) -> None: 

110 if storage is None: 

111 storage = get_option("mode.string_storage") 

112 if storage not in {"python", "pyarrow"}: 

113 raise ValueError( 

114 f"Storage must be 'python' or 'pyarrow'. Got {storage} instead." 

115 ) 

116 if storage == "pyarrow" and pa_version_under7p0: 

117 raise ImportError( 

118 "pyarrow>=7.0.0 is required for PyArrow backed StringArray." 

119 ) 

120 self.storage = storage 

121 

122 @property 

123 def type(self) -> type[str]: 

124 return str 

125 

126 @classmethod 

127 def construct_from_string(cls, string): 

128 """ 

129 Construct a StringDtype from a string. 

130 

131 Parameters 

132 ---------- 

133 string : str 

134 The type of the name. The storage type will be taking from `string`. 

135 Valid options and their storage types are 

136 

137 ========================== ============================================== 

138 string result storage 

139 ========================== ============================================== 

140 ``'string'`` pd.options.mode.string_storage, default python 

141 ``'string[python]'`` python 

142 ``'string[pyarrow]'`` pyarrow 

143 ========================== ============================================== 

144 

145 Returns 

146 ------- 

147 StringDtype 

148 

149 Raise 

150 ----- 

151 TypeError 

152 If the string is not a valid option. 

153 """ 

154 if not isinstance(string, str): 

155 raise TypeError( 

156 f"'construct_from_string' expects a string, got {type(string)}" 

157 ) 

158 if string == "string": 

159 return cls() 

160 elif string == "string[python]": 

161 return cls(storage="python") 

162 elif string == "string[pyarrow]": 

163 return cls(storage="pyarrow") 

164 else: 

165 raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'") 

166 

167 # https://github.com/pandas-dev/pandas/issues/36126 

168 # error: Signature of "construct_array_type" incompatible with supertype 

169 # "ExtensionDtype" 

170 def construct_array_type( # type: ignore[override] 

171 self, 

172 ) -> type_t[BaseStringArray]: 

173 """ 

174 Return the array type associated with this dtype. 

175 

176 Returns 

177 ------- 

178 type 

179 """ 

180 from pandas.core.arrays.string_arrow import ArrowStringArray 

181 

182 if self.storage == "python": 

183 return StringArray 

184 else: 

185 return ArrowStringArray 

186 

187 def __from_arrow__( 

188 self, array: pyarrow.Array | pyarrow.ChunkedArray 

189 ) -> BaseStringArray: 

190 """ 

191 Construct StringArray from pyarrow Array/ChunkedArray. 

192 """ 

193 if self.storage == "pyarrow": 

194 from pandas.core.arrays.string_arrow import ArrowStringArray 

195 

196 return ArrowStringArray(array) 

197 else: 

198 import pyarrow 

199 

200 if isinstance(array, pyarrow.Array): 

201 chunks = [array] 

202 else: 

203 # pyarrow.ChunkedArray 

204 chunks = array.chunks 

205 

206 results = [] 

207 for arr in chunks: 

208 # using _from_sequence to ensure None is converted to NA 

209 str_arr = StringArray._from_sequence(np.array(arr)) 

210 results.append(str_arr) 

211 

212 if results: 

213 return StringArray._concat_same_type(results) 

214 else: 

215 return StringArray(np.array([], dtype="object")) 

216 

217 

218class BaseStringArray(ExtensionArray): 

219 """ 

220 Mixin class for StringArray, ArrowStringArray. 

221 """ 

222 

223 @doc(ExtensionArray.tolist) 

224 def tolist(self): 

225 if self.ndim > 1: 

226 return [x.tolist() for x in self] 

227 return list(self.to_numpy()) 

228 

229 

230class StringArray(BaseStringArray, PandasArray): 

231 """ 

232 Extension array for string data. 

233 

234 .. warning:: 

235 

236 StringArray is considered experimental. The implementation and 

237 parts of the API may change without warning. 

238 

239 Parameters 

240 ---------- 

241 values : array-like 

242 The array of data. 

243 

244 .. warning:: 

245 

246 Currently, this expects an object-dtype ndarray 

247 where the elements are Python strings 

248 or nan-likes (``None``, ``np.nan``, ``NA``). 

249 This may change without warning in the future. Use 

250 :meth:`pandas.array` with ``dtype="string"`` for a stable way of 

251 creating a `StringArray` from any sequence. 

252 

253 .. versionchanged:: 1.5.0 

254 

255 StringArray now accepts array-likes containing 

256 nan-likes(``None``, ``np.nan``) for the ``values`` parameter 

257 in addition to strings and :attr:`pandas.NA` 

258 

259 copy : bool, default False 

260 Whether to copy the array of data. 

261 

262 Attributes 

263 ---------- 

264 None 

265 

266 Methods 

267 ------- 

268 None 

269 

270 See Also 

271 -------- 

272 :func:`pandas.array` 

273 The recommended function for creating a StringArray. 

274 Series.str 

275 The string methods are available on Series backed by 

276 a StringArray. 

277 

278 Notes 

279 ----- 

280 StringArray returns a BooleanArray for comparison methods. 

281 

282 Examples 

283 -------- 

284 >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string") 

285 <StringArray> 

286 ['This is', 'some text', <NA>, 'data.'] 

287 Length: 4, dtype: string 

288 

289 Unlike arrays instantiated with ``dtype="object"``, ``StringArray`` 

290 will convert the values to strings. 

291 

292 >>> pd.array(['1', 1], dtype="object") 

293 <PandasArray> 

294 ['1', 1] 

295 Length: 2, dtype: object 

296 >>> pd.array(['1', 1], dtype="string") 

297 <StringArray> 

298 ['1', '1'] 

299 Length: 2, dtype: string 

300 

301 However, instantiating StringArrays directly with non-strings will raise an error. 

302 

303 For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`: 

304 

305 >>> pd.array(["a", None, "c"], dtype="string") == "a" 

306 <BooleanArray> 

307 [True, <NA>, False] 

308 Length: 3, dtype: boolean 

309 """ 

310 

311 # undo the PandasArray hack 

312 _typ = "extension" 

313 

314 def __init__(self, values, copy: bool = False) -> None: 

315 values = extract_array(values) 

316 

317 super().__init__(values, copy=copy) 

318 if not isinstance(values, type(self)): 

319 self._validate() 

320 NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python")) 

321 

322 def _validate(self): 

323 """Validate that we only store NA or strings.""" 

324 if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True): 

325 raise ValueError("StringArray requires a sequence of strings or pandas.NA") 

326 if self._ndarray.dtype != "object": 

327 raise ValueError( 

328 "StringArray requires a sequence of strings or pandas.NA. Got " 

329 f"'{self._ndarray.dtype}' dtype instead." 

330 ) 

331 # Check to see if need to convert Na values to pd.NA 

332 if self._ndarray.ndim > 2: 

333 # Ravel if ndims > 2 b/c no cythonized version available 

334 lib.convert_nans_to_NA(self._ndarray.ravel("K")) 

335 else: 

336 lib.convert_nans_to_NA(self._ndarray) 

337 

338 @classmethod 

339 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False): 

340 if dtype and not (isinstance(dtype, str) and dtype == "string"): 

341 dtype = pandas_dtype(dtype) 

342 assert isinstance(dtype, StringDtype) and dtype.storage == "python" 

343 

344 from pandas.core.arrays.masked import BaseMaskedArray 

345 

346 if isinstance(scalars, BaseMaskedArray): 

347 # avoid costly conversion to object dtype 

348 na_values = scalars._mask 

349 result = scalars._data 

350 result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) 

351 result[na_values] = libmissing.NA 

352 

353 else: 

354 if hasattr(scalars, "type"): 

355 # pyarrow array 

356 scalars = np.array(scalars) 

357 # convert non-na-likes to str, and nan-likes to StringDtype().na_value 

358 result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy) 

359 

360 # Manually creating new array avoids the validation step in the __init__, so is 

361 # faster. Refactor need for validation? 

362 new_string_array = cls.__new__(cls) 

363 NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python")) 

364 

365 return new_string_array 

366 

367 @classmethod 

368 def _from_sequence_of_strings( 

369 cls, strings, *, dtype: Dtype | None = None, copy: bool = False 

370 ): 

371 return cls._from_sequence(strings, dtype=dtype, copy=copy) 

372 

373 @classmethod 

374 def _empty(cls, shape, dtype) -> StringArray: 

375 values = np.empty(shape, dtype=object) 

376 values[:] = libmissing.NA 

377 return cls(values).astype(dtype, copy=False) 

378 

379 def __arrow_array__(self, type=None): 

380 """ 

381 Convert myself into a pyarrow Array. 

382 """ 

383 import pyarrow as pa 

384 

385 if type is None: 

386 type = pa.string() 

387 

388 values = self._ndarray.copy() 

389 values[self.isna()] = None 

390 return pa.array(values, type=type, from_pandas=True) 

391 

392 def _values_for_factorize(self): 

393 arr = self._ndarray.copy() 

394 mask = self.isna() 

395 arr[mask] = None 

396 return arr, None 

397 

398 def __setitem__(self, key, value): 

399 value = extract_array(value, extract_numpy=True) 

400 if isinstance(value, type(self)): 

401 # extract_array doesn't extract PandasArray subclasses 

402 value = value._ndarray 

403 

404 key = check_array_indexer(self, key) 

405 scalar_key = lib.is_scalar(key) 

406 scalar_value = lib.is_scalar(value) 

407 if scalar_key and not scalar_value: 

408 raise ValueError("setting an array element with a sequence.") 

409 

410 # validate new items 

411 if scalar_value: 

412 if isna(value): 

413 value = libmissing.NA 

414 elif not isinstance(value, str): 

415 raise TypeError( 

416 f"Cannot set non-string value '{value}' into a StringArray." 

417 ) 

418 else: 

419 if not is_array_like(value): 

420 value = np.asarray(value, dtype=object) 

421 if len(value) and not lib.is_string_array(value, skipna=True): 

422 raise TypeError("Must provide strings.") 

423 

424 mask = isna(value) 

425 if mask.any(): 

426 value = value.copy() 

427 value[isna(value)] = libmissing.NA 

428 

429 super().__setitem__(key, value) 

430 

431 def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None: 

432 # the super() method NDArrayBackedExtensionArray._putmask uses 

433 # np.putmask which doesn't properly handle None/pd.NA, so using the 

434 # base class implementation that uses __setitem__ 

435 ExtensionArray._putmask(self, mask, value) 

436 

437 def astype(self, dtype, copy: bool = True): 

438 dtype = pandas_dtype(dtype) 

439 

440 if is_dtype_equal(dtype, self.dtype): 

441 if copy: 

442 return self.copy() 

443 return self 

444 

445 elif isinstance(dtype, IntegerDtype): 

446 arr = self._ndarray.copy() 

447 mask = self.isna() 

448 arr[mask] = 0 

449 values = arr.astype(dtype.numpy_dtype) 

450 return IntegerArray(values, mask, copy=False) 

451 elif isinstance(dtype, FloatingDtype): 

452 arr = self.copy() 

453 mask = self.isna() 

454 arr[mask] = "0" 

455 values = arr.astype(dtype.numpy_dtype) 

456 return FloatingArray(values, mask, copy=False) 

457 elif isinstance(dtype, ExtensionDtype): 

458 # Skip the PandasArray.astype method 

459 return ExtensionArray.astype(self, dtype, copy) 

460 elif np.issubdtype(dtype, np.floating): 

461 arr = self._ndarray.copy() 

462 mask = self.isna() 

463 arr[mask] = 0 

464 values = arr.astype(dtype) 

465 values[mask] = np.nan 

466 return values 

467 

468 return super().astype(dtype, copy) 

469 

470 def _reduce( 

471 self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs 

472 ): 

473 if name in ["min", "max"]: 

474 return getattr(self, name)(skipna=skipna, axis=axis) 

475 

476 raise TypeError(f"Cannot perform reduction '{name}' with string dtype") 

477 

478 def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: 

479 nv.validate_min((), kwargs) 

480 result = masked_reductions.min( 

481 values=self.to_numpy(), mask=self.isna(), skipna=skipna 

482 ) 

483 return self._wrap_reduction_result(axis, result) 

484 

485 def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar: 

486 nv.validate_max((), kwargs) 

487 result = masked_reductions.max( 

488 values=self.to_numpy(), mask=self.isna(), skipna=skipna 

489 ) 

490 return self._wrap_reduction_result(axis, result) 

491 

492 def value_counts(self, dropna: bool = True) -> Series: 

493 from pandas import value_counts 

494 

495 result = value_counts(self._ndarray, dropna=dropna).astype("Int64") 

496 result.index = result.index.astype(self.dtype) 

497 return result 

498 

499 def memory_usage(self, deep: bool = False) -> int: 

500 result = self._ndarray.nbytes 

501 if deep: 

502 return result + lib.memory_usage_of_objects(self._ndarray) 

503 return result 

504 

505 @doc(ExtensionArray.searchsorted) 

506 def searchsorted( 

507 self, 

508 value: NumpyValueArrayLike | ExtensionArray, 

509 side: Literal["left", "right"] = "left", 

510 sorter: NumpySorter = None, 

511 ) -> npt.NDArray[np.intp] | np.intp: 

512 if self._hasna: 

513 raise ValueError( 

514 "searchsorted requires array to be sorted, which is impossible " 

515 "with NAs present." 

516 ) 

517 return super().searchsorted(value=value, side=side, sorter=sorter) 

518 

519 def _cmp_method(self, other, op): 

520 from pandas.arrays import BooleanArray 

521 

522 if isinstance(other, StringArray): 

523 other = other._ndarray 

524 

525 mask = isna(self) | isna(other) 

526 valid = ~mask 

527 

528 if not lib.is_scalar(other): 

529 if len(other) != len(self): 

530 # prevent improper broadcasting when other is 2D 

531 raise ValueError( 

532 f"Lengths of operands do not match: {len(self)} != {len(other)}" 

533 ) 

534 

535 other = np.asarray(other) 

536 other = other[valid] 

537 

538 if op.__name__ in ops.ARITHMETIC_BINOPS: 

539 result = np.empty_like(self._ndarray, dtype="object") 

540 result[mask] = libmissing.NA 

541 result[valid] = op(self._ndarray[valid], other) 

542 return StringArray(result) 

543 else: 

544 # logical 

545 result = np.zeros(len(self._ndarray), dtype="bool") 

546 result[valid] = op(self._ndarray[valid], other) 

547 return BooleanArray(result, mask) 

548 

549 _arith_method = _cmp_method 

550 

551 # ------------------------------------------------------------------------ 

552 # String methods interface 

553 # error: Incompatible types in assignment (expression has type "NAType", 

554 # base class "PandasArray" defined the type as "float") 

555 _str_na_value = libmissing.NA # type: ignore[assignment] 

556 

557 def _str_map( 

558 self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True 

559 ): 

560 from pandas.arrays import BooleanArray 

561 

562 if dtype is None: 

563 dtype = StringDtype(storage="python") 

564 if na_value is None: 

565 na_value = self.dtype.na_value 

566 

567 mask = isna(self) 

568 arr = np.asarray(self) 

569 

570 if is_integer_dtype(dtype) or is_bool_dtype(dtype): 

571 constructor: type[IntegerArray] | type[BooleanArray] 

572 if is_integer_dtype(dtype): 

573 constructor = IntegerArray 

574 else: 

575 constructor = BooleanArray 

576 

577 na_value_is_na = isna(na_value) 

578 if na_value_is_na: 

579 na_value = 1 

580 result = lib.map_infer_mask( 

581 arr, 

582 f, 

583 mask.view("uint8"), 

584 convert=False, 

585 na_value=na_value, 

586 # error: Argument 1 to "dtype" has incompatible type 

587 # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected 

588 # "Type[object]" 

589 dtype=np.dtype(dtype), # type: ignore[arg-type] 

590 ) 

591 

592 if not na_value_is_na: 

593 mask[:] = False 

594 

595 return constructor(result, mask) 

596 

597 elif is_string_dtype(dtype) and not is_object_dtype(dtype): 

598 # i.e. StringDtype 

599 result = lib.map_infer_mask( 

600 arr, f, mask.view("uint8"), convert=False, na_value=na_value 

601 ) 

602 return StringArray(result) 

603 else: 

604 # This is when the result type is object. We reach this when 

605 # -> We know the result type is truly object (e.g. .encode returns bytes 

606 # or .findall returns a list). 

607 # -> We don't know the result type. E.g. `.get` can return anything. 

608 return lib.map_infer_mask(arr, f, mask.view("uint8"))