from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    ClassVar,
    Literal,
)

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import ensure_string_array
from pandas.compat import pa_version_under10p1
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
    ExtensionDtype,
    StorageExtensionDtype,
    register_extension_dtype,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.missing import isna

from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays.base import ExtensionArray
from pandas.core.arrays.floating import (
    FloatingArray,
    FloatingDtype,
)
from pandas.core.arrays.integer import (
    IntegerArray,
    IntegerDtype,
)
from pandas.core.arrays.numpy_ import NumpyExtensionArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        AxisInt,
        Dtype,
        DtypeObj,
        NumpySorter,
        NumpyValueArrayLike,
        Scalar,
        Self,
        npt,
        type_t,
    )

    from pandas import Series


@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
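
    The PyArrow-backed variant with NumPy semantics is requested the same way:

    >>> pd.StringDtype(storage="pyarrow_numpy")
    string[pyarrow_numpy]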
    """

    # error: Cannot override instance variable (previously declared on
    # base class "StorageExtensionDtype") with class variable
    name: ClassVar[str] = "string"  # type: ignore[misc]

    #: StringDtype().na_value uses pandas.NA except the implementation that
    # follows NumPy semantics, which uses nan.
    @property
    def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
        if self.storage == "pyarrow_numpy":
            return np.nan
        else:
            return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        if storage is None:
            infer_string = get_option("future.infer_string")
            if infer_string:
                storage = "pyarrow_numpy"
            else:
                storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
            raise ValueError(
                f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
                f"Got {storage} instead."
            )
        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
            raise ImportError(
                "pyarrow>=10.0.1 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        return str

    @classmethod
    def construct_from_string(cls, string) -> Self:
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from ``string``.
            Valid options and their storage types are

            ============================== ==============================================
            string                         result storage
            ============================== ==============================================
            ``'string'``                   pd.options.mode.string_storage, default python
            ``'string[python]'``           python
            ``'string[pyarrow]'``          pyarrow
            ``'string[pyarrow_numpy]'``    pyarrow_numpy
            ============================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
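
        Examples
        --------
        >>> pd.StringDtype.construct_from_string("string[pyarrow]")
        string[pyarrow]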
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        elif string == "string[pyarrow_numpy]":
            return cls(storage="pyarrow_numpy")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays.string_arrow import (
            ArrowStringArray,
            ArrowStringArrayNumpySemantics,
        )

        if self.storage == "python":
            return StringArray
        elif self.storage == "pyarrow":
            return ArrowStringArray
        else:
            return ArrowStringArrayNumpySemantics

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        elif self.storage == "pyarrow_numpy":
            from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

            return ArrowStringArrayNumpySemantics(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # convert chunk by chunk to numpy and then concatenate, to avoid
                # overflow for large string data when concatenating the pyarrow arrays
                arr = arr.to_numpy(zero_copy_only=False)
                arr = ensure_string_array(arr, na_value=libmissing.NA)
                results.append(arr)

            if len(chunks) == 0:
                arr = np.array([], dtype=object)
            else:
                arr = np.concatenate(results)

            # Bypass validation inside StringArray constructor, see GH#47781
            new_string_array = StringArray.__new__(StringArray)
            NDArrayBacked.__init__(
                new_string_array,
                arr,
                StringDtype(storage="python"),
            )
            return new_string_array


class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        if self.ndim > 1:
            return [x.tolist() for x in self]
        return list(self.to_numpy())

    @classmethod
    def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
        if lib.infer_dtype(scalars, skipna=True) not in ["string", "empty"]:
            # TODO: require any NAs be valid-for-string
            raise ValueError
        return cls._from_sequence(scalars, dtype=dtype)


# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
# incompatible with definition in base class "ExtensionArray"
class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

    .. versionchanged:: 1.5.0

       StringArray now accepts array-likes containing
       nan-likes (``None``, ``np.nan``) for the ``values`` parameter
       in addition to strings and :attr:`pandas.NA`.

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <NumpyExtensionArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

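    >>> pd.arrays.StringArray(np.array(['1', 1], dtype="object"))
    Traceback (most recent call last):
    ...
    ValueError: StringArray requires a sequence of strings or pandas.NA
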
    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the NumpyExtensionArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if we need to convert NA values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if lib.is_pyarrow_array(scalars):
                # pyarrow array; we cannot rely on the "to_numpy" check in
                # ensure_string_array because calling scalars.to_numpy would set
                # zero_copy_only to True which caused problems see GH#52076
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating the new array avoids the validation step in __init__, so
        # it is faster. Refactor to remove the need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        arr = self._ndarray.copy()
        mask = self.isna()
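        # use None (not pd.NA) as the sentinel in the object array; it is also
        # returned as the na_value that factorize should treat as missing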
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value) -> None:
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract NumpyExtensionArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
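            # fill NA positions with a throwaway 0 so the cast to the numpy
            # integer dtype succeeds; ``mask`` records which entries are missing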
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
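            # same idea as the integer path, but working on a StringArray copy,
            # so the placeholder has to be the string "0"; ``mask`` marks the
            # missing entries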
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the NumpyExtensionArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        from pandas.core.algorithms import value_counts_internal as value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

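        # arithmetic binops (e.g. "+" for concatenation) produce a new StringArray,
        # while comparison ops return a BooleanArray that carries the NA mask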
        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "NumpyExtensionArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray | BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            elif dtype == np.dtype("bool"):
                na_value = bool(na_value)
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))